exiawsh committed on Sep 20, 2025

Commit

cf6360b

verified ·

1 Parent(s): 60f1e49

上传模型

Browse files

Files changed (21) hide show

.gitattributes +1 -0
added_tokens.json +39 -0
all_results.json +9 -0
chat_template.json +3 -0
config.json +79 -0
configuration_eagle3_vl.py +95 -0
generation_config.json +6 -0
image_processing_eagle3_vl_fast.py +221 -0
model.safetensors +3 -0
modeling_eagle3_vl.py +416 -0
modeling_siglip2.py +1419 -0
preprocessor_config.json +36 -0
processing_eagle3_vl.py +868 -0
processor_config.json +14 -0
special_tokens_map.json +42 -0
tokenizer_config.json +344 -0
train_results.json +9 -0
trainer_state.json +0 -0
training_args.bin +3 -0
training_log.txt +3 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+training_log.txt filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "</box>": 151673,
+  "</img>": 151671,
+  "</interval>": 151679,
+  "</quad>": 151675,
+  "</ref>": 151677,
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<IMG_CONTEXT>": 151669,
+  "<box>": 151672,
+  "<img>": 151670,
+  "<interval>": 151678,
+  "<quad>": 151674,
+  "<ref>": 151676,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 5.461479216500315e+20,
+    "train_loss": 0.09150631395949999,
+    "train_runtime": 9502.0046,
+    "train_samples": 166521,
+    "train_samples_per_second": 17.525,
+    "train_steps_per_second": 0.617
+}

chat_template.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}<image {{ image_count.value }}>{% endif %}<image-{{ image_count.value }}>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}<video {{ video_count.value }}>{% endif %}<video-{{ video_count.value }}>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+}

config.json ADDED Viewed

	@@ -0,0 +1,79 @@

+{
+  "_attn_implementation": "flash_attention_2",
+  "_commit_hash": null,
+  "architectures": [
+    "Eagle3_VLForConditionalGeneration"
+  ],
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": false,
+  "image_token_index": 151669,
+  "loss_version": "efficient_v2_cp_head",
+  "max_dynamic_tiles": 12,
+  "min_dynamic_tiles": 1,
+  "mlp_checkpoint": false,
+  "mlp_connector_layers": 2,
+  "model_type": "eagle_3_vl",
+  "pad2square": false,
+  "select_layer": -1,
+  "text_config": {
+    "_attn_implementation_autoset": true,
+    "_name_or_path": "Qwen/Qwen3-1.7B",
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "max_position_embeddings": 40960,
+    "max_window_layers": 28,
+    "model_type": "qwen3",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "torch_dtype": "bfloat16",
+    "use_cache": false,
+    "use_sliding_window": false,
+    "vocab_size": 151680
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_pixel_shuffle": true,
+  "use_thumbnail": false,
+  "vision_config": {
+    "_attn_implementation_autoset": true,
+    "attention_dropout": 0.0,
+    "full_attention_indexes": [
+      7,
+      14,
+      21,
+      26
+    ],
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "intermediate_size": 4304,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip2_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "num_patches": 256,
+    "patch_size": 14,
+    "torch_dtype": "bfloat16",
+    "use_rope": false,
+    "use_windows_attn": true,
+    "window_size": 14
+  }
+}

configuration_eagle3_vl.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import copy
+from eaglevl.model.phi3.configuration_phi3 import Phi3Config
+from eaglevl.model.llama.configuration_llama import LlamaConfig
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
+from .modeling_siglip2 import Siglip2VisionConfig
+from eaglevl.model.c_radio.radio_model import RADIOConfig
+logger = logging.get_logger(__name__)
+class Eagle3_VLConfig(PretrainedConfig):
+    model_type = 'eagle_3_vl'
+    is_composition = True
+    sub_configs = {"vision_config": SiglipVisionConfig, "text_config": Qwen2Config}
+    def __init__(
+            self,
+            vision_config=None,
+            text_config=None,
+            use_backbone_lora=0,
+            use_llm_lora=0,
+            pad2square=False,
+            select_layer=-4,
+            downsample_ratio=0.5,
+            template=None,
+            loss_version='v1',
+            mlp_checkpoint=False,
+            image_token_index=151667,
+            **kwargs):
+        super().__init__(**kwargs)
+        if vision_config is None:
+            vision_config = {'model_type': 'siglip_vision_model'}
+            logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
+        if text_config is None:
+            text_config = {'architectures': ['Qwen2ForCausalLM']}
+            logger.info('text_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
+        if vision_config['model_type'] == 'siglip_vision_model':
+            self.vision_config = SiglipVisionConfig(**vision_config)
+        elif vision_config['model_type'] == 'siglip2_vision_model':
+            self.vision_config = Siglip2VisionConfig(**vision_config)
+        elif vision_config['model_type'] == 'intern_vit_6b':
+            self.vision_config = InternVisionConfig(**vision_config)
+        elif vision_config['model_type'] == 'radio':
+            self.vision_config = RADIOConfig(**vision_config)
+        else:
+            raise ValueError('Unsupported model_type: {}'.format(vision_config['model_type']))
+        if text_config['architectures'][0] == 'LlamaForCausalLM':
+            self.text_config = LlamaConfig(**text_config)
+        elif text_config['architectures'][0] == 'Phi3ForCausalLM':
+            self.text_config = Phi3Config(**text_config)
+        elif text_config['architectures'][0] == 'Qwen2ForCausalLM':
+            self.text_config = Qwen2Config(**text_config)
+        elif text_config['architectures'][0] == 'Qwen3ForCausalLM':
+            self.text_config = Qwen3Config(**text_config)
+        else:
+            raise ValueError('Unsupported architecture: {}'.format(text_config['architectures'][0]))
+        self.use_backbone_lora = use_backbone_lora
+        self.use_llm_lora = use_llm_lora
+        self.mlp_checkpoint = mlp_checkpoint
+        self.pad2square = pad2square
+        self.select_layer = select_layer
+        self.downsample_ratio = downsample_ratio
+        self.template = template
+        self.loss_version = loss_version
+        self.tie_word_embeddings = self.text_config.tie_word_embeddings
+        self.image_token_index = image_token_index
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output['vision_config'] = self.vision_config.to_dict()
+        output['text_config'] = self.text_config.to_dict()
+        output['model_type'] = self.__class__.model_type
+        output['use_backbone_lora'] = self.use_backbone_lora
+        output['use_llm_lora'] = self.use_llm_lora
+        output['select_layer'] = self.select_layer
+        output['downsample_ratio'] = self.downsample_ratio
+        output['template'] = self.template
+        output['image_token_index'] = self.image_token_index
+        output['_attn_implementation'] = self._attn_implementation
+        output['_attn_implementation_autoset'] = self._attn_implementation_autoset
+        return output

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "transformers_version": "4.51.0"
+}

image_processing_eagle3_vl_fast.py ADDED Viewed

	@@ -0,0 +1,221 @@

+# --------------------------------------------------------
+# NVIDIA
+# Copyright (c) 2025 NVIDIA
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/image_processing_llava_onevision_fast.py
+from typing import List, Optional, Union
+from transformers.image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
+from transformers.image_processing_utils_fast import (
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    divide_to_patches,
+    group_images_by_shape,
+    reorder_images,
+)
+from transformers.image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    IMAGENET_STANDARD_MEAN, # 0.5, 0.5, 0.5
+    IMAGENET_STANDARD_STD, # 0.5, 0.5, 0.5
+    ChannelDimension,
+    ImageInput,
+    VideoInput,
+    PILImageResampling,
+    SizeDict,
+    get_image_size,
+    make_flat_list_of_images,
+    make_batched_videos,
+    validate_kwargs
+)
+from transformers.processing_utils import Unpack
+from transformers.utils import TensorType, add_start_docstrings, is_torch_available, is_torchvision_v2_available
+if is_torch_available():
+    import torch
+if is_torchvision_v2_available():
+    from transformers.image_utils import pil_torch_interpolation_mapping
+    from torchvision.transforms.v2 import functional as F
+else:
+    from torchvision.transforms import functional as F
+def crop(img: torch.Tensor, left: int, top: int, right: int, bottom: int) -> torch.Tensor:
+    """Crop the given numpy array.
+    Args:
+        img (torch.Tensor): Image to be cropped. Format should be (C, H, W).
+        left (int): The left coordinate of the crop box.
+        top (int): The top coordinate of the crop box.
+        right (int): The right coordinate of the crop box.
+        bottom (int): The bottom coordinate of the crop box.
+    Returns:
+        torch.Tensor: Cropped image.
+    """
+    if not isinstance(img, torch.Tensor):
+        raise TypeError('img should be torch.Tensor. Got {}'.format(type(img)))
+    if img.ndim not in [2, 3]:
+        raise ValueError('Image should have 2 or 3 dimensions. Got {}'.format(img.ndim))
+    img_height = img.shape[1]
+    img_width = img.shape[2]
+    if top < 0 or left < 0 or bottom > img_height or right > img_width:
+        raise ValueError('Crop coordinates out of bounds')
+    if top >= bottom or left >= right:
+        raise ValueError('Invalid crop coordinates')
+    return img[:, top:bottom, left:right]
+class Eagle3_VLFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    # Adds the optional `do_pad` keyword on top of the default fast-image-processor keywords.
+    do_pad: Optional[bool]
+@add_start_docstrings(
+    "Constructs a fast ConvNeXT image processor. Based on [`SiglipImageProcessor`] with incorporation of processing each video frame.",
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+    """
+        image_grid_pinpoints (`List[List[int]]`, *optional*):
+            A list of possible resolutions to use for processing high resolution images. The best resolution is selected
+            based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
+            method. Not used for processing videos.
+        do_pad (`bool`, *optional*):
+            Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
+            number of patches in the batch. Padding will be applied to the bottom and right with zeros.
+    """,
+)
+class Eagle3_VLImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 448, "width": 448}
+    default_to_square = False
+    crop_size = None
+    do_resize = True
+    do_center_crop = None
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    do_pad = True
+    valid_kwargs = Eagle3_VLFastImageProcessorKwargs
+    model_input_names = ["pixel_values_videos"]
+    def __init__(self, **kwargs: Unpack[Eagle3_VLFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+    @add_start_docstrings(
+        BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
+        """
+            do_pad (`bool`, *optional*):
+                    Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
+                    number of patches in the batch. Padding will be applied to the bottom and right with zeros.
+        """,
+    )
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[Eagle3_VLFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+    def _prepare_images_structure(
+        self,
+        images: ImageInput,
+    ) -> ImageInput:
+        """
+        Prepare the images structure for processing.
+        Args:
+            images (`ImageInput`):
+                The input images to process.
+        Returns:
+            `ImageInput`: The images with a valid nesting.
+        """
+        return make_flat_list_of_images(images)
+    def _preprocess(
+        self,
+        images: List["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, List[float]]],
+        image_std: Optional[Union[float, List[float]]],
+        do_pad: bool,
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> BatchFeature:
+        image_sizes = [get_image_size(image, channel_dim=ChannelDimension.FIRST) for image in images]
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(images)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images)
+        return BatchFeature(
+            data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
+        )
+    def preprocess(self, images: ImageInput, videos: VideoInput=None, **kwargs: Unpack[Eagle3_VLFastImageProcessorKwargs]) -> BatchFeature:
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_kwargs.__annotations__.keys())
+        # Set default kwargs from self. This ensures that if a kwarg is not provided
+        # by the user, it gets its default value from the instance, or is set to None.
+        for kwarg_name in self.valid_kwargs.__annotations__:
+            kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
+        # Extract parameters that are only used for preparing the input images
+        do_convert_rgb = kwargs.pop("do_convert_rgb")
+        input_data_format = kwargs.pop("input_data_format")
+        device = kwargs.pop("device")
+        # Prepare input images
+        if images is not None:
+            images = self._prepare_input_images(
+                images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+            )
+        if videos is not None:
+            videos = self._prepare_input_images(
+                images=videos, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+            )
+        # Update kwargs that need further processing before being validated
+        kwargs = self._further_process_kwargs(**kwargs)
+        # Validate kwargs
+        self._validate_preprocess_kwargs(**kwargs)
+        # torch resize uses interpolation instead of resample
+        resample = kwargs.pop("resample")
+        kwargs["interpolation"] = (
+            pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
+        )
+        # Pop kwargs that are not needed in _preprocess
+        kwargs.pop("default_to_square")
+        kwargs.pop("data_format")
+        if images is not None:
+            return self._preprocess(images, **kwargs)
+        elif videos is not None:
+            return self._preprocess(videos, **kwargs)
+__all__ = ["Eagle3_VLImageProcessorFast"]

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddd1319bb78e5dd3852b19fe670926403a9dd22127927b66fd019efdcac43fb4
+size 4944136416

modeling_eagle3_vl.py ADDED Viewed

	@@ -0,0 +1,416 @@

+# --------------------------------------------------------
+# NVIDIA
+# Copyright (c) 2025 NVIDIA
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import warnings
+import inspect
+from typing import Any, List, Optional, Tuple, Union
+import torch
+from torch import nn
+import torch.distributed as dist
+from torch.nn import CrossEntropyLoss
+import torch.nn.functional as F
+from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM
+from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM
+from transformers.models.llama.modeling_llama import LlamaForCausalLM
+import torch.utils.checkpoint as cp
+from transformers.models.siglip.modeling_siglip import SiglipVisionModel
+from .modeling_siglip2 import Siglip2VisionModel
+from peft import LoraConfig, get_peft_model
+from transformers.generation import GenerationMixin
+from transformers import GenerationConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+from .configuration_eagle3_vl import Eagle3_VLConfig
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from collections import defaultdict
+logger = logging.get_logger(__name__)
+# Shared docstring preamble, prepended to the model class docstring via `add_start_docstrings`.
+# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/modeling_llava_onevision.py#L241C1-L280C1
+EAGLE3_VL_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`Eagle3_VLConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    "The bare Eagle3_VL Model outputting raw hidden-states without any specific head on top.",
+    EAGLE3_VL_START_DOCSTRING,
+)
+class Eagle3_VLPreTrainedModel(PreTrainedModel):
+    config_class = Eagle3_VLConfig
+    base_model_prefix = "model"
+    main_input_name = 'input_ids'
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Qwen2DecoderLayer", "LlamaDecoderLayer" ,"Siglip2EncoderLayer", "SiglipEncoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+    _supports_static_cache = True
+    _supports_quantized_cache = True
+    _supports_sdpa = True
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+class Eagle3_VLForConditionalGeneration(Eagle3_VLPreTrainedModel, GenerationMixin):
+    config_class = Eagle3_VLConfig
+    def __init__(self, config: Eagle3_VLConfig, vision_model=None, language_model=None):
+        super().__init__(config)
+        self.select_layer = config.select_layer
+        self.template = config.template
+        self.downsample_ratio = config.downsample_ratio
+        self.loss_version = config.loss_version
+        self.mlp_checkpoint = config.mlp_checkpoint
+        logger.info(f'mlp_checkpoint: {self.mlp_checkpoint}')
+        if vision_model is not None:
+            self.vision_model = vision_model
+        else:
+            if config.vision_config.model_type == 'intern_vit_6b':
+                self.vision_model = InternVisionModel(config.vision_config)
+            elif config.vision_config.model_type == 'siglip_vision_model':
+                config.vision_config._attn_implementation = 'flash_attention_2'
+                self.vision_model = SiglipVisionModel(config.vision_config)
+            elif config.vision_config.model_type == 'siglip2_vision_model':
+                config.vision_config._attn_implementation = 'flash_attention_2'
+                self.vision_model = Siglip2VisionModel(config.vision_config)
+            elif config.vision_config.model_type == 'radio':
+                self.vision_model = RADIOModel(config.vision_config)
+        if language_model is not None:
+            self.language_model = language_model
+        else:
+            if config.text_config.architectures[0] == 'LlamaForCausalLM':
+                self.language_model = LlamaForCausalLM(config.text_config)
+            elif config.text_config.architectures[0] == 'Phi3ForCausalLM':
+                self.language_model = Phi3ForCausalLM(config.text_config)
+            elif config.text_config.architectures[0] == 'Qwen2ForCausalLM':
+                assert config.text_config._attn_implementation == 'flash_attention_2', f"Qwen2 must use flash_attention_2 but got {config.text_config._attn_implementation}"
+                self.language_model = Qwen2ForCausalLM(config.text_config)
+            elif config.text_config.architectures[0] == 'Qwen3ForCausalLM':
+                assert config.text_config._attn_implementation == 'flash_attention_2', f"Qwen3 must use flash_attention_2 but got {config.text_config._attn_implementation}"
+                self.language_model = Qwen3ForCausalLM(config.text_config)
+            else:
+                raise NotImplementedError(f'{config.text_config.architectures[0]} is not implemented.')
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.text_config.hidden_size
+        self.mlp1 = nn.Sequential(
+                nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+                nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
+                nn.GELU(),
+                nn.Linear(llm_hidden_size, llm_hidden_size)
+            )
+        self.image_token_index = config.image_token_index
+        self.neftune_alpha = None
+        if config.use_backbone_lora:
+            self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=2 * config.use_backbone_lora)
+        self.use_llm_lora = config.use_llm_lora
+        if config.use_llm_lora:
+            self.wrap_llm_lora(r=config.use_llm_lora, lora_alpha=2 * config.use_llm_lora)
+        self.check_forward_kwargs()
+    def check_forward_kwargs(self):
+        # We intentionally avoid using **kwargs in forward because Hugging Face Transformers
+        # has special handling for functions with **kwargs parameters that would affect
+        # how our model is processed during training and inference.
+        forward_params = inspect.signature(self.forward).parameters
+        assert not any(k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values())
+    def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
+        lora_config = LoraConfig(
+            r=r,
+            target_modules=['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.out_proj',
+                            'mlp.fc1', 'mlp.fc2'],
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+        )
+        self.vision_model = get_peft_model(self.vision_model, lora_config)
+        self.vision_model.print_trainable_parameters()
+    def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
+        lora_config = LoraConfig(
+            r=r,
+            target_modules=['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
+                            'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj'],
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+            task_type='CAUSAL_LM'
+        )
+        self.language_model = get_peft_model(self.language_model, lora_config)
+        self.language_model.enable_input_require_grads()
+        self.language_model.print_trainable_parameters()
+        self.use_llm_lora = True
+    def forward(
+            self,
+            pixel_values: List[torch.FloatTensor],
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            image_flags: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        input_embeds = self.language_model.get_input_embeddings()(input_ids)
+        num_images = len(pixel_values)
+        if image_flags is not None:
+            image_flags = image_flags.view(-1)
+        vit_embeds = self.extract_feature(pixel_values, image_flags)
+        B, N, C = input_embeds.shape
+        input_embeds = input_embeds.reshape(B * N, C)
+        input_ids = input_ids.reshape(B * N)
+        selected = (input_ids == self.image_token_index)
+        try:
+            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds
+        except Exception as e:
+            print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
+                  f'vit_embeds.shape={vit_embeds.shape}')
+            n_token = selected.sum()
+            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token]
+        input_embeds = input_embeds.reshape(B, N, C)
+        outputs = self.language_model(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        logits = outputs.logits
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def pixel_shuffle_back(self, vit_embeds, spatial_shapes):
+        # Assume vit_embeds: [1, 15020, 1152], spatial_shapes: [(h1,w1), (h2,w2), ...] length 64
+        B, N, C = vit_embeds.shape
+        shapes = spatial_shapes.tolist()  # List of (h, w)
+        # 1) Split at once
+        lengths = [h * w for (h, w) in shapes]               # Number of patches per image
+        slices = torch.split(vit_embeds.view(-1, C), lengths, dim=0)
+        # slices[i]: [hi*wi, C]
+        # 2) Convert to [C, H, W]
+        features = [
+            sl.transpose(0, 1).reshape(C, h, w)
+            for sl, (h, w) in zip(slices, shapes)
+        ]  # Each item [C, hi, wi]
+        # visualize_tensor_list(features, 'features.jpg')
+        # 3) Group by scale and batch unshuffle
+        down_feats = [None] * len(features)
+        grouped: dict = defaultdict(list)
+        for idx, (h, w) in enumerate(shapes):
+            grouped[(h, w)].append(idx)
+        for (h, w), idxs in grouped.items():
+            # Stack features of the same scale -> [n, C, H, W]
+            grp = torch.stack([features[i] for i in idxs], dim=0)
+            # Pixel Unshuffle at once
+            out = F.pixel_unshuffle(grp, downscale_factor=int(1/self.downsample_ratio))  # [n, C*4, H//2, W//2]
+            out = out.flatten(start_dim=2).transpose(1, 2)  # [n, H//2 * W//2, C*4]
+            # Split back to respective positions
+            for i, feat in zip(idxs, out):
+                down_feats[i] = feat
+        down_feats = torch.cat(down_feats, dim=0).unsqueeze(0)
+        return down_feats, (spatial_shapes*self.downsample_ratio).to(torch.int32)
+    def mask_valid_tokens(self, vit_embeds, spatial_shapes, image_flags):
+        """
+        vit_embeds: Tensor, shape [1, N, C] or [N, C]
+        spatial_shapes: Tensor of shape [num_images, 2], each row is (H, W)
+        image_flags: list[int], e.g. [1, 0, 1, ...]
+        Returns:
+        valid_tokens: Tensor [num_valid_tokens, C]
+        """
+        lengths = spatial_shapes[:, 0] * spatial_shapes[:, 1]  # [num_images]
+        valid_mask = []
+        for flag, length in zip(image_flags, lengths):
+            valid_mask.extend([flag] * length)
+        valid_mask = torch.tensor(valid_mask, dtype=torch.bool, device=vit_embeds.device)
+        valid_tokens = vit_embeds[valid_mask]  # [num_valid_tokens, C]
+        return valid_tokens
+    def extract_feature(self, pixel_values, image_flags=None):
+        if self.select_layer == -1:
+            vision_model_output = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=False,
+                return_dict=True)
+            if hasattr(vision_model_output, 'last_hidden_state'):
+                vit_embeds = vision_model_output.last_hidden_state
+            if hasattr(vision_model_output, 'spatial_shapes'):
+                spatial_shapes = vision_model_output.spatial_shapes
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=True,
+                return_dict=True).hidden_states[self.select_layer]
+        vit_embeds, spatial_shapes = self.pixel_shuffle_back(vit_embeds, spatial_shapes)
+        if self.mlp_checkpoint and vit_embeds.requires_grad:
+            vit_embeds = cp.checkpoint(self.mlp1, vit_embeds)
+        else:
+            vit_embeds = self.mlp1(vit_embeds)
+        B, N, C = vit_embeds.shape
+        vit_embeds = vit_embeds.reshape(B * N, C)
+        if image_flags is not None and any(image_flags==0):
+            vit_embeds = self.mask_valid_tokens(vit_embeds, spatial_shapes, image_flags)
+        return vit_embeds
+    @torch.no_grad()
+    def generate(
+            self,
+            pixel_values: Optional[torch.FloatTensor] = None,
+            input_ids: Optional[torch.FloatTensor] = None,
+            attention_mask: Optional[torch.LongTensor] = None,
+            visual_features: Optional[torch.FloatTensor] = None,
+            generation_config: Optional[GenerationConfig] = None,
+            output_hidden_states: Optional[bool] = None,
+            image_sizes: Optional[List[Tuple[int, int]]] = None,
+            **generate_kwargs,
+    ) -> torch.LongTensor:
+        if pixel_values is not None:
+            if visual_features is not None:
+                vit_embeds = visual_features
+            else:
+                pixel_values = [each.to(self.device) for each in pixel_values]
+                import time
+                torch.cuda.synchronize()
+                begin_time = time.time()
+                for _ in range(10):
+                    vit_embeds = self.extract_feature(pixel_values)
+                torch.cuda.synchronize()
+                end_time = time.time()
+                print(f'extract_feature time: {(end_time - begin_time) / 10}')
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+            B, N, C = input_embeds.shape
+            input_embeds = input_embeds.reshape(B * N, C)
+            input_ids = input_ids.reshape(B * N)
+            selected = (input_ids == self.config.image_token_index)
+            assert selected.sum() != 0
+            input_embeds[selected] = vit_embeds.to(input_embeds.device)
+            input_embeds = input_embeds.reshape(B, N, C)
+        else:
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+        if 'use_cache' not in generate_kwargs:
+            generate_kwargs['use_cache'] = True
+        outputs = self.language_model.generate(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            generation_config=generation_config,
+            output_hidden_states=output_hidden_states,
+            **generate_kwargs,
+        )
+        return outputs
+    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings
+    def get_input_embeddings(self):
+        """Delegate to the wrapped language model and return its `get_input_embeddings()` result."""
+        return self.language_model.get_input_embeddings()
+    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_input_embeddings
+    def set_input_embeddings(self, value):
+        """Forward `value` to the wrapped language model's `set_input_embeddings`."""
+        self.language_model.set_input_embeddings(value)
+    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_output_embeddings
+    def get_output_embeddings(self):
+        """Delegate to the wrapped language model and return its `get_output_embeddings()` result."""
+        return self.language_model.get_output_embeddings()
+    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        """Forward `new_embeddings` to the wrapped language model's `set_output_embeddings`."""
+        self.language_model.set_output_embeddings(new_embeddings)
+    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.set_decoder
+    def set_decoder(self, decoder):
+        """Forward `decoder` to the wrapped language model's `set_decoder`."""
+        self.language_model.set_decoder(decoder)
+    # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_decoder
+    def get_decoder(self):
+        """Delegate to the wrapped language model and return its `get_decoder()` result."""
+        return self.language_model.get_decoder()

modeling_siglip2.py ADDED Viewed

	@@ -0,0 +1,1419 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/siglip2/modular_siglip2.py.
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Tuple, Union, List
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn.init import _calculate_fan_in_and_fan_out
+from transformers.activations import ACT2FN
+from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    can_return_tuple,
+    logging,
+    replace_return_docstrings,
+)
+from transformers.models.siglip2.configuration_siglip2 import Siglip2Config, Siglip2TextConfig
+from collections import defaultdict
+from itertools import accumulate
+from math import isqrt
+from typing import Dict
+logger = logging.get_logger(__name__)
+import inspect
+import os
+from typing import Optional, Tuple
+import torch
+import torch.nn.functional as F
+from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal, logging
+from transformers.integrations.flash_attention import flash_attention_forward as original_flash_attention_forward
+flash_241 = is_flash_attn_greater_or_equal("2.4.1")
+deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
+logger = logging.get_logger(__name__)
+if is_flash_attn_2_available():
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+    from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_varlen_qkvpacked_func
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+def _flash_attention_forward(
+    query_states: torch.Tensor,
+    key_states: torch.Tensor,
+    value_states: torch.Tensor,
+    attention_mask: torch.Tensor,
+    query_length: int,
+    is_causal: bool,
+    dropout: float = 0.0,
+    position_ids: Optional[torch.Tensor] = None,
+    softmax_scale: Optional[float] = None,
+    sliding_window: Optional[int] = None,
+    use_top_left_mask: bool = False,
+    softcap: Optional[float] = None,
+    deterministic: bool = None,
+    cu_seq_lens_q: Optional[torch.LongTensor] = None,
+    cu_seq_lens_k: Optional[torch.LongTensor] = None,
+    max_length_q: Optional[int] = None,
+    max_length_k: Optional[int] = None,
+    target_dtype: Optional[torch.dtype] = None,
+    **kwargs,
+    ):
+    """
+    Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+    first unpad the input, then computes the attention scores and pad the final attention scores.
+    Args:
+        query_states (`torch.Tensor`):
+            Input query states to be passed to Flash Attention API
+        key_states (`torch.Tensor`):
+            Input key states to be passed to Flash Attention API
+        value_states (`torch.Tensor`):
+            Input value states to be passed to Flash Attention API
+        attention_mask (`torch.Tensor`):
+            The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+            position of padding tokens and 1 for the position of non-padding tokens.
+        dropout (`int`, *optional*):
+            Attention dropout
+        softmax_scale (`float`, *optional*):
+            The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        use_sliding_windows (`bool`, *optional*):
+            Whether to activate sliding window attention.
+    """
+    if not use_top_left_mask:
+        causal = is_causal
+    else:
+        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+        causal = is_causal and query_length != 1
+    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
+    use_sliding_windows = (
+        _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window
+    )
+    flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}
+    if flash_241:
+        if deterministic is None:
+            deterministic = deterministic_g
+        flash_kwargs["deterministic"] = deterministic
+    if softcap is not None:
+        flash_kwargs["softcap"] = softcap
+    attn_output = flash_attn_varlen_func(
+        query_states[0],
+        key_states[0],
+        value_states[0],
+        cu_seqlens_q=cu_seq_lens_q,
+        cu_seqlens_k=cu_seq_lens_k,
+        max_seqlen_q=max_length_q,
+        max_seqlen_k=max_length_k,
+        dropout_p=dropout,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        **flash_kwargs,
+        )
+    return attn_output
+from transformers.utils import is_flash_attn_greater_or_equal_2_10
+# True when `is_flash_attn_greater_or_equal_2_10()` reports False; passed as `use_top_left_mask`
+# to `_flash_attention_forward` by `flash_attention_forward_for_packing`.
+_use_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+def flash_attention_forward_for_packing(
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor]=None,
+    dropout: float = 0.0,
+    scaling: Optional[float] = None,
+    sliding_window: Optional[int] = None,
+    softcap: Optional[float] = None,
+    seq_len_list: Optional[List[int]] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, None]:
+    # This is before the transpose
+    seq_len = query.shape[2]
+    # FA2 uses non-transposed inputs
+    query = query.transpose(1, 2)
+    key = key.transpose(1, 2)
+    value = value.transpose(1, 2)
+    # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+    # therefore the input hidden states gets silently casted in float32. Hence, we need
+    # cast them back in the correct dtype just to be sure everything works as expected.
+    # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+    # in fp32. (usually our RMSNorm modules handle it correctly)
+    target_dtype = None
+    if query.dtype == torch.float32:
+        if torch.is_autocast_enabled():
+            target_dtype = torch.get_autocast_gpu_dtype()
+        # Handle the case where the model is quantized
+        elif hasattr(module.config, "_pre_quantization_dtype"):
+            target_dtype = module.config._pre_quantization_dtype
+        else:
+            target_dtype = next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype
+    # FA2 always relies on the value set in the module, so remove it if present in kwargs to avoid passing it twice
+    kwargs.pop("is_causal", None)
+    cu_seqlens = F.pad(torch.cumsum(torch.tensor(seq_len_list, device=query.device, dtype=torch.int32), dim=0), (1, 0))
+    cu_seqlens = cu_seqlens.to(torch.int32)
+    max_seq_len = max(seq_len_list)
+    attn_output = _flash_attention_forward(
+        query,
+        key,
+        value,
+        attention_mask,
+        query_length=seq_len,
+        is_causal=module.is_causal,
+        dropout=dropout,
+        softmax_scale=scaling,
+        sliding_window=sliding_window,
+        softcap=softcap,
+        use_top_left_mask=_use_top_left_mask,
+        target_dtype=target_dtype,
+        cu_seq_lens_q=cu_seqlens,
+        cu_seq_lens_k=cu_seqlens,
+        max_length_q=max_seq_len,
+        max_length_k=max_seq_len,
+        **kwargs,
+    )
+    return attn_output.squeeze(0), None
+# Config class name for generated docstrings.
+# NOTE(review): presumably consumed by the docstring decorators imported above; no use is visible here.
+_CONFIG_FOR_DOC = "Siglip2Config"
+class Siglip2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Siglip2VisionModel`]. It is used to instantiate a
+    Siglip2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip2
+    [google/siglip2-base-patch16-naflex](https://huggingface.co/google/siglip2-base-patch16-naflex) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        num_patches (`int`, *optional*, defaults to 256):
+            The number of patches in the image with the size of (`patch_size`, `patch_size`).
+            The image is resized to fill maximum of this number of patches, and to preserve
+            the aspect ratio. In case the resulted number of patches is lower, the image is
+            padded in "patch" dimension.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+    Example:
+    ```python
+    >>> from transformers import Siglip2VisionConfig, Siglip2VisionModel
+    >>> # Initializing a Siglip2VisionConfig with google/siglip2-base-patch16-naflex style configuration
+    >>> configuration = Siglip2VisionConfig()
+    >>> # Initializing a Siglip2VisionModel (with random weights) from the google/siglip2-base-patch16-naflex style configuration
+    >>> model = Siglip2VisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "siglip2_vision_model"
+    base_config_key = "vision_config"
+    def __init__(
+        self,
+        hidden_size=1152,
+        intermediate_size=4304,
+        num_hidden_layers=27,
+        num_attention_heads=16,
+        num_channels=3,
+        num_patches=256,
+        patch_size=14, # manully modified
+        hidden_act="gelu_pytorch_tanh",
+        layer_norm_eps=1e-6,
+        attention_dropout=0.0,
+        window_size=14, #
+        full_attention_indexes=[7, 14, 21, 26],
+        use_rope=True,
+        use_windows_attn=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.num_patches = num_patches
+        self.window_size = window_size
+        self.full_attention_indexes = full_attention_indexes
+        self.use_windows_attn = use_windows_attn
+        self.use_rope = use_rope
+@dataclass
+class Siglip2VisionOutput(ModelOutput):
+    """
+    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+    Args:
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+            The image embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        spatial_shapes (`torch.LongTensor`, *optional*):
+            Per-image patch-grid sizes. NOTE(review): presumably of shape `(num_images, 2)` holding
+            `(height, width)` in patches; confirm where the vision model fills this field.
+    """
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    spatial_shapes: Optional[torch.LongTensor] = None
+def convert_image_to_patches(image: "torch.Tensor", patch_size: int) -> "torch.Tensor":
+    """
+    Convert 3D tensor image of shape (num_channels, image_height, image_width) into 2D tensor of patches of shape
+    (num_patches_height * num_patches_width, patch_size * patch_size * num_channels).
+    """
+    num_channels, image_height, image_width = image.shape
+    num_patches_height = image_height // patch_size
+    num_patches_width = image_width // patch_size
+    patched_image = image.reshape(num_channels, num_patches_height, patch_size, num_patches_width, patch_size)
+    patched_image = patched_image.permute(1, 3, 2, 4, 0)
+    patched_image = patched_image.reshape(num_patches_height * num_patches_width, -1)
+    return patched_image
+def convert_images_to_patches(image: "torch.Tensor", patch_size: int) -> "torch.Tensor":
+    """
+    Convert 4D tensor image of shape (batch_size, num_channels, image_height, image_width) into 2D tensor of patches of shape
+    (batch_size, num_patches_height * num_patches_width, patch_size * patch_size * num_channels).
+    """
+    batch_size, num_channels, image_height, image_width = image.shape
+    assert image_height % patch_size == 0 and image_width % patch_size == 0, f"image_height % patch_size == 0 and image_width % patch_size == 0"
+    num_patches_height = image_height // patch_size
+    num_patches_width = image_width // patch_size
+    patched_image = image.reshape(batch_size, num_channels, num_patches_height, patch_size, num_patches_width, patch_size)
+    patched_image = patched_image.permute(0, 2, 4, 3, 5, 1) # (batch_size, num_patches_height, num_patches_width, patch_size, patch_size, num_channels)
+    patched_image = patched_image.reshape(batch_size * num_patches_height * num_patches_width, -1)
+    return patched_image
+class Siglip2VisionEmbeddings(nn.Module):
+    """
+    Patch + position embedding layer that also regroups the tokens of every image into windows.
+    Images are patchified, linearly embedded, summed with bilinearly resized learned position
+    embeddings, and finally reordered so that tokens of the same `window_size` x `window_size`
+    window are contiguous.
+    """
+    def __init__(self, config: Siglip2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.patch_size = config.patch_size
+        self.window_size = config.window_size
+        # Patches arrive already flattened, so a Linear layer takes the place of a strided conv.
+        self.patch_embedding = nn.Linear(
+            in_features=config.num_channels * self.patch_size * self.patch_size,
+            out_features=self.embed_dim,
+        )
+        self.num_patches = config.num_patches
+        # The learned position table is viewed as a square grid of this side length in `forward`.
+        self.position_embedding_size = int(self.num_patches**0.5)
+        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
+    def split_patch_embeddings_to_windows_with_meta(self, patch_embeds, batch_hw, window_size):
+        """
+        Reorder packed patch embeddings so that the tokens of each window are contiguous.
+        Args:
+            patch_embeds: Tensor, shape (1, sum(H_i*W_i), C)
+            batch_hw:    Tensor whose rows are (H_i, W_i); converted with `.tolist()`
+            window_size: int
+        Returns:
+            all_tokens:      Tensor, shape (1, sum(H_i*W_i), C); the input tokens in window-major
+                order (image by image, windows in row-major order, row-major inside a window).
+                Padding added to complete border windows is removed again.
+            win_meta_list:   List[dict], one per window in output order, with keys:
+                - img_idx:   index in batch_hw
+                - patch_hw:  original (H, W)
+                - win_xy:    (h0, w0) top-left corner of the window in the original image
+                - win_hw:    (h_eff, w_eff) part of the window that lies inside the original image
+            reverse_mapping: LongTensor of length sum(H_i*W_i); entry k is the position inside
+                `all_tokens` of the token that was at flat position k of `patch_embeds`.
+        """
+        # 1. Compute the start offset of every image inside the flat tensor
+        batch_hw = batch_hw.tolist()
+        counts = [H * W for (H, W) in batch_hw]
+        starts = [0] + list(accumulate(counts))[:-1]
+        # 2. Group by (H, W) so that images of the same size are processed together
+        size2info = defaultdict(list)
+        for img_idx, ((H, W), start) in enumerate(zip(batch_hw, starts)):
+            size2info[(H, W)].append((img_idx, start))
+        all_windows = []
+        all_meta    = []
+        # 3. For every size group: batched pad + unfold
+        for (H, W), info in size2info.items():
+            H, W = int(H), int(W)
+            B = len(info)
+            C = patch_embeds.shape[-1]
+            img_idxs, img_starts = zip(*info)
+            # 3.1 Slice the images out and reshape them to (B, C, H, W)
+            imgs = []
+            for st in img_starts:
+                flat = patch_embeds[0, st: st + H * W]            # (H*W, C)
+                imgs.append(flat.transpose(0,1).reshape(C, H, W))
+            batch_tensor = torch.stack(imgs, dim=0)            # (B, C, H, W)
+            # 3.2 Compute the padding (bottom, right) that makes H and W multiples of window_size
+            pad_h = (window_size - H % window_size) % window_size
+            pad_w = (window_size - W % window_size) % window_size
+            # pad format: (left, right, top, bottom) for last two dims
+            batch_padded = F.pad(batch_tensor, (0, pad_w, 0, pad_h))
+            H_pad, W_pad = H + pad_h, W + pad_w
+            n_h = H_pad // window_size
+            n_w = W_pad // window_size
+            n_windows = n_h * n_w
+            # 3.3 batched unfold -> (B, C*ws*ws, n_windows)
+            patches_unf = F.unfold(
+                batch_padded,
+                kernel_size=(window_size, window_size),
+                stride=(window_size, window_size)
+            )
+            # 3.4 reshape to (B*n_windows, ws*ws, C)
+            patches = (
+                patches_unf
+                .view(B, C, window_size * window_size, n_windows)  # (B, C, ws*ws, n_win)
+                .permute(0, 3, 2, 1)                               # (B, n_win, ws*ws, C)
+                .reshape(-1, window_size * window_size, C)        # (B*n_win, ws*ws, C)
+            )
+            all_windows.append(patches)
+            # 3.5 Build the meta entries: record the valid window extent inside the original image
+            for b, img_idx in enumerate(img_idxs):
+                for win_id in range(n_windows):
+                    i, j = divmod(win_id, n_w)
+                    h0, w0 = i * window_size, j * window_size
+                    # Actual end coordinates, clipped to the original image
+                    h1 = min(h0 + window_size, H)
+                    w1 = min(w0 + window_size, W)
+                    all_meta.append({
+                        'img_idx':  img_idx,
+                        'patch_hw': (H, W),
+                        'win_xy':   (h0, w0),
+                        'win_hw':   (h1 - h0, w1 - w0),  # size of the valid region
+                    })
+        # 4. Concatenate, then sort by img_idx + win_xy to restore the input image order
+        sorted_idx = sorted(
+            range(len(all_meta)),
+            key=lambda k: (
+                all_meta[k]['img_idx'],
+                all_meta[k]['win_xy'][0],
+                all_meta[k]['win_xy'][1]
+            )
+        )
+        all_windows = torch.cat(all_windows, dim=0)
+        all_windows = all_windows[sorted_idx]
+        win_meta_list      = [all_meta[i] for i in sorted_idx]
+        windows_list = []
+        for meta, win in zip(win_meta_list, all_windows):
+            h_eff, w_eff = meta['win_hw']
+            valid_num = h_eff * w_eff
+            # Keep only the patch tokens that really come from the original image (drop padding)
+            if valid_num  == window_size * window_size:
+                windows_list.append(win)
+            else:
+                win = win.view(window_size, window_size, -1)[:h_eff, :w_eff, :].reshape(h_eff * w_eff, -1)
+                windows_list.append(win)  # shape (valid_num, C)
+        # Concatenate the per-window token lists into one tensor with a leading batch axis of 1
+        all_tokens = torch.cat(windows_list, dim=0).unsqueeze(0)  # shape (1, sum(valid_num), C)
+        # 1. Recompute the start offset of every image inside the original flat tensor
+        counts = [H * W for H, W in batch_hw]
+        starts = [0] + list(accumulate(counts))[:-1]
+        total_patches = sum(counts)
+        # 2. Build the mapping: mapping[orig_idx] = new_idx
+        mapping = [None] * total_patches
+        offset = 0  # cursor along the token axis of all_tokens
+        for meta in win_meta_list:
+            img_idx = meta['img_idx']
+            H, W      = meta['patch_hw']
+            h0, w0    = meta['win_xy']
+            h_eff, w_eff = meta['win_hw']
+            base = starts[img_idx]
+            # Map every patch token of this window that really comes from the original image
+            for u in range(h_eff):
+                for v in range(w_eff):
+                    # Flat index in the original (row-major per image) layout
+                    orig_idx = base + (h0+u) * W + (w0) + v
+                    # Position inside all_tokens: row-major within this window's segment
+                    p = u * w_eff + v
+                    mapping[orig_idx] = offset + p
+            # Once the window is done, advance the cursor by its number of valid tokens
+            offset += h_eff * w_eff
+        reverse_mapping = torch.tensor(mapping, dtype=torch.long)
+        return all_tokens, win_meta_list, reverse_mapping
+    @staticmethod
+    def resize_positional_embeddings(
+        positional_embeddings: torch.Tensor,
+        spatial_shapes: torch.LongTensor,
+    ) -> torch.Tensor:
+        """
+        Resize positional embeddings to every image-specific size and pack the results.
+        Args:
+            positional_embeddings (`torch.Tensor`):
+                Position embeddings of shape (height, width, embed_dim)
+            spatial_shapes (`torch.LongTensor`):
+                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
+        Returns:
+            `torch.Tensor`: Embeddings of shape (1, sum(height_i * width_i), embed_dim), the
+            resized embeddings of all images concatenated in order (no padding is applied).
+        """
+        batch_size = spatial_shapes.shape[0]
+        embed_dim = positional_embeddings.shape[-1]
+        source_dtype = positional_embeddings.dtype
+        resulted_positional_embeddings = []
+        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
+        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)
+        # Upcast to float32 on CPU because antialias is not supported for bfloat16/float16 on CPU
+        if positional_embeddings.device.type == "cpu":
+            positional_embeddings = positional_embeddings.to(torch.float32)
+        for i in range(batch_size):
+            # (1, dim, height, width) -> (1, dim, target_height, target_width)
+            height, width = spatial_shapes[i]
+            resized_embeddings = F.interpolate(
+                positional_embeddings,
+                size=(height, width),
+                mode="bilinear",
+                align_corners=False,
+                antialias=True,
+            )
+            # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
+            resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)
+            # Cast to original dtype
+            resized_embeddings = resized_embeddings.to(source_dtype)
+            resulted_positional_embeddings.append(resized_embeddings)
+        return torch.cat(resulted_positional_embeddings, dim=0).unsqueeze(0)
+    def get_spatial_shapes(self, bchw_list: List[torch.Size]) -> torch.Tensor:
+        """
+        Build the per-image patch-grid sizes from a list of (b, c, h, w) shapes.
+        Every shape contributes `b` rows of `(h // patch_size, w // patch_size)`; the result is a
+        tensor of shape (total_images, 2) created with `torch.tensor` on its default device.
+        """
+        hw_list = []
+        for shape in bchw_list:
+            b, _, h, w = shape
+            hw_list.extend([(h//self.patch_size, w//self.patch_size)] * b)
+        hw_tensor = torch.tensor(hw_list)
+        return hw_tensor
+    def forward(self, pixel_values: torch.FloatTensor) -> Tuple[torch.Tensor, List[dict], torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            pixel_values:
+                Iterable of image batches; each element must have shape
+                (batch_size, num_channels, height, width) with height and width divisible by
+                `patch_size`. Elements may differ in resolution.
+        Returns:
+            A tuple `(windows_tensor, win_meta_list, spatial_shapes, reverse_mapping)`:
+            window-ordered embeddings of shape (1, total_patches, embed_dim), the per-window
+            metadata, the (total_images, 2) patch-grid sizes, and the original-to-window index map
+            (see `split_patch_embeddings_to_windows_with_meta`).
+        """
+        bchw_list = [each.shape for each in pixel_values]
+        pixel_values = torch.cat([convert_images_to_patches(each, self.patch_size) for each in pixel_values], dim=0)
+        # Apply patch embeddings to already patchified pixel values
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+        # View the learned position table as a square grid before resizing it per image
+        positional_embeddings = self.position_embedding.weight.reshape(
+            self.position_embedding_size, self.position_embedding_size, -1
+        )
+        spatial_shapes = self.get_spatial_shapes(bchw_list)
+        resized_positional_embeddings = self.resize_positional_embeddings(
+            positional_embeddings, spatial_shapes
+        )
+        # Add positional embeddings to patch embeddings; (total, C) + (1, total, C) broadcasts to (1, total, C)
+        embeddings = patch_embeds + resized_positional_embeddings
+        windows_tensor, win_meta_list, reverse_mapping = self.split_patch_embeddings_to_windows_with_meta(embeddings, spatial_shapes, self.window_size)
+        return windows_tensor, win_meta_list, spatial_shapes, reverse_mapping
+class Rope2DPosEmb(nn.Module):
+    """
+    copy from https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/blob/main/modeling_kimi_vl.py#L324
+    2D rotary position embedding with multi-resolution support.
+    This class is intended to be used in the following way:
+    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
+    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
+    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
+        The rope is shared across all attention layers and all heads.
+    Refs:
+    - RoFormer: https://arxiv.org/abs/2104.09864
+    - VisionLLaMA: https://arxiv.org/abs/2403.00522
+    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py
+    Args:
+        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
+        max_height (int): the maximum height of the 2D grid
+        max_width (int): the maximum width of the 2D grid
+        theta_base (float): the base of the theta
+        device (str): the device to store the precomputed cis
+    """
+    def __init__(self, dim: int, max_height: int, max_width: int, theta_base=10000, window_size=14):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 4 == 0, "dim must be divisible by 4"
+        self.max_height = max_height
+        self.max_width = max_width
+        self.theta_base = theta_base
+        self.window_size = window_size
+        self.freqs_cis = None
+    def extra_repr(self):
+        return f"dim={self.dim}, max_height={self.max_height}, max_width={self.max_width}, theta_base={self.theta_base}"
+    def _precompute_freqs_cis(self, device: torch.device) -> torch.Tensor:
+        """Calculate the cis(freqs) for each position in the 2D grid.
+        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
+            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
+            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
+            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
+        """
+        N = self.max_height * self.max_width
+        flat_pos = torch.arange(0, N).float().to(device)
+        x_pos = flat_pos % self.max_width
+        y_pos = flat_pos // self.max_width
+        dim_range = (
+            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(device)
+        )  # C/4
+        freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
+        x_freqs = torch.outer(x_pos, freqs).float()  # N, C/4
+        y_freqs = torch.outer(y_pos, freqs).float()  # N, C/4
+        x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)  # N, C/4
+        y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)  # N, C/4
+        # N, C/4, 2
+        freqs_cis = torch.cat(
+            [x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1
+        )
+        # max_height, max_width, C/2
+        freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
+        return freqs_cis
+    def get_freqs_cis(self, win_meta_list: List[Dict], device: torch.device) -> torch.Tensor:
+        """
+        Args:
+            win_meta_list (List[Dict]): window meta list
+        Returns:
+            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
+        """
+        if self.freqs_cis is None:
+            self.freqs_cis = self._precompute_freqs_cis(device)
+        # assert all xy <512
+        assert all(win_meta['win_xy'][0] + win_meta['win_hw'][0] < 512 and win_meta['win_xy'][1] + win_meta['win_hw'][1] < 512 for win_meta in win_meta_list)
+        freqs_cis = torch.cat([self.freqs_cis[win_meta['win_xy'][0]:win_meta['win_xy'][0] + win_meta['win_hw'][0], win_meta['win_xy'][1]: win_meta['win_xy'][1] + win_meta['win_hw'][1]].reshape(-1, self.dim // 2) for win_meta in win_meta_list], dim=0)
+        freqs_cis = freqs_cis.unsqueeze(0)
+        return freqs_cis
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+def _apply_rope_input_validation(x, freqs_cis):
+    assert x.ndim == freqs_cis.ndim + 1, (x.shape, freqs_cis.shape)
+    assert x.shape[:-2] == freqs_cis.shape[:-1], (x.shape, freqs_cis.shape)
+    assert x.shape[-1] == 2 * freqs_cis.shape[-1], (x.shape, freqs_cis.shape)
+    assert freqs_cis.dtype == torch.complex64, freqs_cis.dtype
+def apply_rope(
+    xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Args: (The leading dimensions of all inputs should be the same)
+        xq: query, tensor of shape (..., num_heads, head_dim)
+        xk: key, tensor of shape (..., num_heads, head_dim)
+        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
+    Returns:
+        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
+    """
+    _apply_rope_input_validation(xq, freqs_cis)
+    _apply_rope_input_validation(xk, freqs_cis)
+    freqs_cis = freqs_cis.unsqueeze(-2)  # ..., 1, head_dim/2
+    # ..., num_heads, head_dim/2
+    xq_ = torch.view_as_complex(xq.float().view(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().view(*xq.shape[:-1], -1, 2))
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+class Siglip2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.is_causal = False
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.use_windows_attn = config.use_windows_attn
+        self.use_rope = config.use_rope
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+        rope_freqs_cis: Optional[torch.Tensor] = None,
+        win_meta_list: Optional[List[Dict]] = None,
+        windows_attn: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+        batch_size, seq_length, embed_dim = hidden_states.shape
+        queries = self.q_proj(hidden_states)
+        keys = self.k_proj(hidden_states)
+        values = self.v_proj(hidden_states)
+        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim) # .transpose(1, 2)
+        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim) # .transpose(1, 2)
+        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        if self.use_rope:
+            queries, keys = apply_rope(queries, keys, rope_freqs_cis)
+        queries = queries.transpose(1, 2)
+        keys = keys.transpose(1, 2)
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            if self.config._attn_implementation == "sdpa" and output_attentions:
+                logger.warning_once(
+                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                )
+            if self.config._attn_implementation == "flash_attention_2":
+                from transformers.modeling_utils import AttentionInterface
+                AttentionInterface._global_mapping['flash_attention_2_packing'] = flash_attention_forward_for_packing
+                setattr(AttentionInterface, 'flash_attention_2_packing', flash_attention_forward_for_packing)
+                attention_interface = ALL_ATTENTION_FUNCTIONS['flash_attention_2_packing']
+            else:
+                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        if windows_attn and self.use_windows_attn:
+            seq_len_list = [win_meta['win_hw'][0] * win_meta['win_hw'][1] for win_meta in win_meta_list]
+        else:
+            mapper = defaultdict(lambda: 0)
+            for win_meta in win_meta_list:
+                mapper[win_meta['img_idx']]  += win_meta['win_hw'][0] * win_meta['win_hw'][1]
+            seq_len_list = [mapper[i] for i in range(len(mapper))]
+        attention_mask = None
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=0.0 if not self.training else self.dropout,
+            seq_len_list=seq_len_list,
+        )
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights
+class Siglip2MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+class Siglip2EncoderLayer(nn.Module):
+    def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.self_attn = Siglip2Attention(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Siglip2MLP(config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+        rope_freqs_cis: Optional[torch.Tensor] = None,
+        win_meta_list: Optional[List[Dict]] = None,
+        windows_attn: Optional[bool] = False,
+    ) -> Tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`):
+                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            output_attentions=output_attentions,
+            rope_freqs_cis=rope_freqs_cis,
+            win_meta_list=win_meta_list,
+            windows_attn=windows_attn,
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+        return outputs
+class Siglip2Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`Siglip2EncoderLayer`].
+    Args:
+        config: Siglip2Config
+    """
+    def __init__(self, config: Siglip2Config):
+        super().__init__()
+        self.config = config
+        self.rope_2d = Rope2DPosEmb(
+            config.hidden_size // config.num_attention_heads, 512, 512, config.window_size
+        )
+        self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+        self.full_attention_indexes = config.full_attention_indexes
+    # Ignore copy
+    @can_return_tuple
+    def forward(
+        self,
+        inputs_embeds,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        win_meta_list: Optional[List[Dict]] = None,
+        spatial_shapes: Optional[torch.Tensor] = None,
+    ) -> BaseModelOutput:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        rope_freqs_cis = self.rope_2d.get_freqs_cis(win_meta_list=win_meta_list, device=inputs_embeds.device)
+        hidden_states = inputs_embeds
+        for win_idx, encoder_layer in enumerate(self.layers):
+            if win_idx not in self.full_attention_indexes:
+                windows_attn = True
+            else:
+                windows_attn = False
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    encoder_layer.__call__,
+                    hidden_states,
+                    output_attentions,
+                    rope_freqs_cis,
+                    win_meta_list,
+                    windows_attn
+                )
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    output_attentions=output_attentions,
+                    rope_freqs_cis=rope_freqs_cis,
+                    win_meta_list=win_meta_list,
+                    windows_attn=windows_attn
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+        )
+def reconstruct_patch_embeddings(last_hidden_state: torch.Tensor, win_meta_list: list[dict], spatial_shapes: torch.Tensor) -> torch.Tensor:
+    idx_map = build_idx_map(win_meta_list, spatial_shapes)
+    last_hidden_state = last_hidden_state[:, idx_map, :]
+    return last_hidden_state
+# Argument documentation injected into `Siglip2VisionTransformer.forward` through
+# `add_start_docstrings_to_model_forward`.
+# NOTE(review): it lists `interpolate_pos_encoding` and `return_dict`, which that forward does not
+# declare as parameters -- confirm whether the text should be trimmed.
+SIGLIP2_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+            Whether to interpolate the pre-trained position encodings.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+class Siglip2VisionTransformer(nn.Module):
+    def __init__(self, config: Siglip2VisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = Siglip2VisionEmbeddings(config)
+        self.encoder = Siglip2Encoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
+        if self.use_head:
+            self.head = Siglip2MultiheadAttentionPoolingHead(config)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+    @can_return_tuple
+    @add_start_docstrings_to_model_forward(SIGLIP2_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2VisionConfig)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        Returns:
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        windows_tensor, win_meta_list, spatial_shapes, reverse_mapping = self.embeddings(pixel_values)
+        encoder_outputs: BaseModelOutput = self.encoder(
+            inputs_embeds=windows_tensor,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            win_meta_list=win_meta_list,
+            spatial_shapes=spatial_shapes,
+        )
+        last_hidden_state = encoder_outputs.last_hidden_state
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+        last_hidden_state = last_hidden_state[:, reverse_mapping, :]
+        return Siglip2VisionOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            spatial_shapes=spatial_shapes,
+        )
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    tensor.erfinv_()
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+    # Clamp to ensure it's in the proper range
+    tensor.clamp_(min=a, max=b)
+def trunc_normal_tf_(
+    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
+) -> torch.Tensor:
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \text{mean} \\leq b`.
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsequently scaled and shifted by the mean and std args.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with torch.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.mul_(std).add_(mean)
+def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+    if mode == "fan_in":
+        denom = fan_in
+    elif mode == "fan_out":
+        denom = fan_out
+    elif mode == "fan_avg":
+        denom = (fan_in + fan_out) / 2
+    variance = scale / denom
+    if distribution == "truncated_normal":
+        # constant is stddev of standard normal truncated to (-2, 2)
+        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+    elif distribution == "normal":
+        with torch.no_grad():
+            tensor.normal_(std=math.sqrt(variance))
+    elif distribution == "uniform":
+        bound = math.sqrt(3 * variance)
+        with torch.no_grad():
+            tensor.uniform_(-bound, bound)
+    else:
+        raise ValueError(f"invalid distribution {distribution}")
+def lecun_normal_(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+def default_flax_embed_init(tensor):
+    variance_scaling_(tensor, mode="fan_in", distribution="normal")
+# Class-level docstring preamble shared by the Siglip2 model classes.
+# NOTE(review): its consumers are outside this part of the file -- presumably attached with a
+# docstring decorator; confirm.
+SIGLIP2_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`Siglip2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+# Argument documentation covering both text inputs (`input_ids`, `attention_mask`, `position_ids`)
+# and image inputs (`pixel_values`).
+# NOTE(review): presumably meant for the combined text+vision model's forward -- confirm where it is used.
+SIGLIP2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+            Whether to interpolate the pre-trained position encodings.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+class Siglip2PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = Siglip2Config
+    base_model_prefix = "siglip2"
+    supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "Siglip2TextEmbeddings",
+        "Siglip2EncoderLayer",
+        "Siglip2VisionEmbeddings",
+        "Siglip2EncoderLayer",
+        "Siglip2MultiheadAttentionPoolingHead",
+    ]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, Siglip2VisionEmbeddings):
+            width = (
+                self.config.vision_config.hidden_size
+                if isinstance(self.config, Siglip2Config)
+                else self.config.hidden_size
+            )
+            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+        elif isinstance(module, nn.Embedding):
+            default_flax_embed_init(module.weight)
+        elif isinstance(module, Siglip2Attention):
+            nn.init.xavier_uniform_(module.q_proj.weight)
+            nn.init.xavier_uniform_(module.k_proj.weight)
+            nn.init.xavier_uniform_(module.v_proj.weight)
+            nn.init.xavier_uniform_(module.out_proj.weight)
+            nn.init.zeros_(module.q_proj.bias)
+            nn.init.zeros_(module.k_proj.bias)
+            nn.init.zeros_(module.v_proj.bias)
+            nn.init.zeros_(module.out_proj.bias)
+        elif isinstance(module, Siglip2MLP):
+            nn.init.xavier_uniform_(module.fc1.weight)
+            nn.init.xavier_uniform_(module.fc2.weight)
+            nn.init.normal_(module.fc1.bias, std=1e-6)
+            nn.init.normal_(module.fc2.bias, std=1e-6)
+        elif isinstance(module, Siglip2MultiheadAttentionPoolingHead):
+            nn.init.xavier_uniform_(module.probe.data)
+            nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
+            nn.init.zeros_(module.attention.in_proj_bias.data)
+        elif isinstance(module, Siglip2Model):
+            logit_scale_init = torch.log(torch.tensor(1.0))
+            module.logit_scale.data.fill_(logit_scale_init)
+            module.logit_bias.data.zero_()
+        elif isinstance(module, Siglip2ForImageClassification):
+            nn.init.normal_(
+                module.classifier.weight,
+                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
+            )
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
+            lecun_normal_(module.weight)
+            if module.bias is not None:
+                nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+class Siglip2MultiheadAttentionPoolingHead(nn.Module):
+    """Multihead Attention Pooling."""
+    def __init__(self, config: Siglip2VisionConfig):
+        super().__init__()
+        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = Siglip2MLP(config)
+        self.num_heads = config.num_attention_heads
+    def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size = hidden_state.shape[0]
+        probe = self.probe.repeat(batch_size, 1, 1)
+        if attention_mask is not None:
+            target_len, source_len = probe.shape[1], hidden_state.shape[1]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
+            attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
+            attention_mask = attention_mask.reshape(-1, target_len, source_len)
+        hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]
+        residual = hidden_state
+        hidden_state = self.layernorm(hidden_state)
+        hidden_state = residual + self.mlp(hidden_state)
+        return hidden_state[:, 0]
+@add_start_docstrings(
+    """The vision model from Siglip2 without any head or projection on top.""",
+    SIGLIP2_START_DOCSTRING,
+)
+class Siglip2VisionModel(Siglip2PreTrainedModel):
+    # Thin wrapper: all computation is delegated to the Siglip2VisionTransformer stored in `self.vision_model`.
+    config_class = Siglip2VisionConfig
+    main_input_name = "pixel_values"
+    def __init__(self, config: Siglip2VisionConfig):
+        super().__init__(config)
+        self.vision_model = Siglip2VisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self) -> nn.Module:
+        """Return the patch-embedding module of the wrapped vision transformer."""
+        return self.vision_model.embeddings.patch_embedding
+    # NOTE(review): the forward docstring below is processed by the decorators (presumably the
+    # bare `Returns:` line is filled in by `replace_return_docstrings`) - keep its layout intact.
+    @can_return_tuple
+    @add_start_docstrings_to_model_forward(SIGLIP2_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2VisionConfig)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        Returns:
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Siglip2VisionModel
+        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
+        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled features
+        ```"""
+        # Arguments are forwarded unchanged; the wrapped transformer builds the output object.
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+# Public API of this module: only the vision model is exported.
+__all__ = [
+    "Siglip2VisionModel",
+]

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "auto_map": {
+    "AutoImageProcessor": "image_processing_eagle3_vl_fast.Eagle3_VLImageProcessorFast",
+    "AutoProcessor": "processing_eagle3_vl.Eagle3_VLProcessor"
+  },
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": false,
+  "device": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": false,
+  "do_rescale": true,
+  "do_resize": false,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Eagle3_VLImageProcessorFast",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "input_data_format": null,
+  "processor_class": "Eagle3_VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "height": 448,
+    "width": 448
+  }
+}

processing_eagle3_vl.py ADDED Viewed

	@@ -0,0 +1,868 @@

+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Eagle3_VL.
+Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llava_onevision/processing_llava_onevision.py
+"""
+import math
+import os
+from typing import Iterable, List, Union, Literal
+import base64
+import sys
+import time
+import warnings
+from functools import lru_cache
+from io import BytesIO
+import re
+import requests
+import torch
+import torchvision
+from packaging import version
+from PIL import Image
+from torchvision import io
+from torchvision import transforms
+from torch.nn import functional as F
+from torchvision.transforms import InterpolationMode
+from typing import Optional, Any
+import numpy as np
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_processing_utils import select_best_resolution
+from transformers.image_utils import ImageInput, VideoInput, get_image_size, to_numpy_array
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils import logging
+from transformers.models.auto import AutoImageProcessor
+import lmdb
+import cv2
+import pickle
+logger = logging.get_logger(__name__)
+# Highly inspired by https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
+# Frame counts are snapped to a multiple of FRAME_FACTOR (see smart_nframes / fetch_video).
+FRAME_FACTOR = 2
+# Default sampling rate used by smart_nframes when the request carries no `fps`.
+FPS = 2.0
+# Default lower / upper bounds on the sampled frame count in fps mode.
+FPS_MIN_FRAMES = 4
+FPS_MAX_FRAMES = 256
+# Default `factor` for smart_resize: both output sides are multiples of it.
+IMAGE_FACTOR = 28
+# Default pixel budget (height * width) for still images.
+MIN_PIXELS = 4 * 28 * 28
+MAX_PIXELS = 4096 * 28 * 28
+# smart_resize rejects inputs whose long side / short side ratio exceeds this value.
+MAX_RATIO = 200
+# Cap applied to each rounded side in smart_resize (7000 px).
+IMAGE_MAX_SIZE = 500 * 14
+# Default per-frame pixel budget for videos (see fetch_video).
+VIDEO_MIN_PIXELS = 128 * 28 * 28
+VIDEO_MAX_PIXELS = 768 * 28 * 28
+# Set the maximum number of video token inputs.
+# Here, 128K represents the maximum number of input tokens for the VLLM model.
+# Remember to adjust it according to your own configuration.
+# NOTE(review): the environment variable read here is named VIDEO_MAX_PIXELS although it
+# overrides VIDEO_TOTAL_PIXELS, not the VIDEO_MAX_PIXELS constant above - confirm this is intended.
+VIDEO_TOTAL_PIXELS = int(float(os.environ.get('VIDEO_MAX_PIXELS', 128000 * 28 * 28 * 0.9)))
+# Logged once, at import time.
+logger.info(f"set VIDEO_TOTAL_PIXELS: {VIDEO_TOTAL_PIXELS}")
+def adjust_by_factor(number: int, factor: int, method: Literal['round', 'ceil', 'floor'] = 'round') -> int:
+    """Adjusts 'number' to the nearest, ceiling, or floor multiple of 'factor'."""
+    op = {'round': round, 'ceil': math.ceil, 'floor': math.floor}[method]
+    return op(number / factor) * factor
+def to_rgb(pil_image: Image.Image) -> Image.Image:
+      if pil_image.mode == 'RGBA':
+          white_background = Image.new("RGB", pil_image.size, (255, 255, 255))
+          white_background.paste(pil_image, mask=pil_image.split()[3])  # Use alpha channel as mask
+          return white_background
+      else:
+          return pil_image.convert("RGB")
+def smart_resize(
+    height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS
+) -> tuple[int, int]:
+    """
+    Rescales the image so that the following conditions are met:
+    1. Both dimensions (height and width) are divisible by 'factor'.
+    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+    3. The aspect ratio of the image is maintained as closely as possible.
+    """
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = min(max(factor, adjust_by_factor(height, factor, method='round')), IMAGE_MAX_SIZE)
+    w_bar = min(max(factor, adjust_by_factor(width, factor, method='round')), IMAGE_MAX_SIZE)
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((h_bar * w_bar) / max_pixels)
+        h_bar = adjust_by_factor(h_bar / beta, factor, method='floor')
+        w_bar = adjust_by_factor(w_bar / beta, factor, method='floor')
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = adjust_by_factor(height * beta, factor, method='ceil')
+        w_bar = adjust_by_factor(width * beta, factor, method='ceil')
+    return h_bar, w_bar
+def read_img_from_lmdb_v2(image_data):
+    # special case for AgiBotWorld
+    lmdb_file, lmdb_key = image_data['lmdb_file'], image_data['lmdb_key']
+    key = lmdb_key.encode('ascii')
+    env = lmdb.open(lmdb_file, max_readers=10240, readonly=True, lock=False, readahead=False, meminit=False)
+    txn = env.begin()
+    value = txn.get(key)
+    if value is None:
+        print(f"Warning: Key {key} not found.")
+        return None
+    record = pickle.loads(value)
+    image_bgr = cv2.imdecode(np.frombuffer(record['image'], dtype=np.uint8), cv2.IMREAD_COLOR)
+    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
+    image = Image.fromarray(image_rgb)
+    return image
+def parse_lmdb_image_data(image_data):
+    lmdb_file = image_data['lmdb_file']
+    if not os.path.exists(lmdb_file):
+        if "/home/zhidingy/workspace/libs/eagle/Eagle2/" in lmdb_file:
+            lmdb_file = lmdb_file.replace("/home/zhidingy/workspace/libs/eagle/Eagle2/", "")
+        else:
+            raise ValueError(f"LMDB file {lmdb_file} does not exist")
+    # special case for AgiBotWorld, will remove it later
+    if 'AgiBotWorld' in image_data['lmdb_file']:
+        return read_img_from_lmdb_v2(image_data)
+    try:
+        env = lmdb.open(image_data['lmdb_file'], readonly=True, lock=False, max_readers=10240)
+    except Exception as e:
+        print(f"Failed to open lmdb file {image_data['lmdb_file']}. Error message: {e}", flush=True)
+        raise e
+    with env.begin(write=False) as txn:
+        try:
+            image_bin = txn.get(image_data['lmdb_key'].encode('ascii'))
+            buf = BytesIO(image_bin)
+        except Exception as e:
+            print(f"Failed to get image from lmdb file {image_data['lmdb_file']}. Error message: {e}", flush=True)
+            raise e
+    try:
+        image = Image.open(buf)
+    except Exception as e:
+        image_np = np.frombuffer(image_bin, dtype=np.uint8)
+        image_bgr = cv2.imdecode(image_np, cv2.IMREAD_COLOR)
+        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
+        image = Image.fromarray(image_rgb)
+    return image
+def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = IMAGE_FACTOR) -> Image.Image:
+    if "image" in ele:
+        image = ele["image"]
+    else:
+        image = ele["image_url"]
+    image_obj = None
+    if isinstance(image, Image.Image):
+        image_obj = image
+    elif isinstance(image, dict) and 'lmdb_file' in image:
+        image_obj = parse_lmdb_image_data(image)
+    elif image.startswith("http://") or image.startswith("https://"):
+        response = requests.get(image, stream=True)
+        image_obj = Image.open(BytesIO(response.content))
+    elif image.startswith("file://"):
+        image_obj = Image.open(image[7:])
+    elif image.startswith("data:image"):
+        if "base64," in image:
+            _, base64_data = image.split("base64,", 1)
+            data = base64.b64decode(base64_data)
+            image_obj = Image.open(BytesIO(data))
+    else:
+        image_obj = Image.open(image)
+    if image_obj is None:
+        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
+    image = to_rgb(image_obj)
+    # if 'scale_factor' in ele:
+    #     scale_factor = ele['scale_factor']
+    #     image = image.resize((image.width * scale_factor, image.height * scale_factor), Image.BILINEAR)
+    if "resized_height" in ele and "resized_width" in ele:
+        resized_height, resized_width = smart_resize(
+            ele["resized_height"],
+            ele["resized_width"],
+            factor=size_factor,
+        )
+    else:
+        width, height = image.size
+        min_pixels = ele.get("min_pixels", MIN_PIXELS)
+        max_pixels = ele.get("max_pixels", MAX_PIXELS)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    image = image.resize((resized_width, resized_height))
+    return image
+def smart_nframes(
+    ele: dict,
+    total_frames: int,
+    video_fps: int | float,
+) -> int:
+    """calculate the number of frames for video used for model inputs.
+    Args:
+        ele (dict): a dict contains the configuration of video.
+            support either `fps` or `nframes`:
+                - nframes: the number of frames to extract for model inputs.
+                - fps: the fps to extract frames for model inputs.
+                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
+                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
+        total_frames (int): the original total number of frames of the video.
+        video_fps (int | float): the original fps of the video.
+    Raises:
+        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].
+    Returns:
+        int: the number of frames for video used for model inputs.
+    """
+    assert not ("fps" in ele and "nframes" in ele), "Only accept either `fps` or `nframes`"
+    if "nframes" in ele:
+        nframes = adjust_by_factor(ele["nframes"], FRAME_FACTOR, method='round')
+    else:
+        fps = ele.get("fps", FPS)
+        min_frames = adjust_by_factor(ele.get("min_frames", FPS_MIN_FRAMES), FRAME_FACTOR, method='ceil')
+        max_frames = adjust_by_factor(ele.get("max_frames", min(FPS_MAX_FRAMES, total_frames)), FRAME_FACTOR, method='floor')
+        nframes = total_frames / video_fps * fps
+        if nframes > total_frames:
+            logger.warning(f"smart_nframes: nframes[{nframes}] > total_frames[{total_frames}]")
+        nframes = min(min(max(nframes, min_frames), max_frames), total_frames)
+        nframes = adjust_by_factor(nframes, FRAME_FACTOR, method='floor')
+    if not (FRAME_FACTOR <= nframes and nframes <= total_frames):
+        # raise ValueError(f"nframes should in interval [{FRAME_FACTOR}, {total_frames}], but got {nframes}.")
+        nframes = total_frames
+    return nframes
+def _read_video_torchvision(
+    ele: dict,
+) -> (torch.Tensor, float, list):
+    """read video using torchvision.io.read_video and return also per-frame timestamps"""
+    video_path = ele["video"]
+    if version.parse(torchvision.__version__) < version.parse("0.19.0"):
+        if "http://" in video_path or "https://" in video_path:
+            warnings.warn("torchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.")
+        if "file://" in video_path:
+            video_path = video_path[7:]
+    st = time.time()
+    video, audio, info = io.read_video(
+        video_path,
+        start_pts=ele.get("video_start", 0.0),
+        end_pts=ele.get("video_end", None),
+        pts_unit="sec",
+        output_format="TCHW",
+    )
+    total_frames, video_fps = video.size(0), info["video_fps"]
+    logger.info(f"torchvision:  {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+    # Calculate frame indices and corresponding timestamps (based on video start time)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long()
+    start_time = ele.get("video_start", 0.0)
+    timestamps = (start_time + idx.to(torch.float32) / video_fps).tolist()
+    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+    video = video[idx]
+    return video, sample_fps, timestamps
+def is_decord_available() -> bool:
+    import importlib.util
+    return importlib.util.find_spec("decord") is not None
+def _read_video_decord(
+    ele: dict,
+) -> (torch.Tensor, float, list):
+    """read video using decord.VideoReader and return also per-frame timestamps"""
+    import decord
+    video_path = ele["video"]
+    st = time.time()
+    vr = decord.VideoReader(video_path)
+    if 'video_start' in ele or 'video_end' in ele:
+        raise NotImplementedError("not support start_pts and end_pts in decord for now.")
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
+    logger.info(f"decord:  {video_path=}, {total_frames=}, {video_fps=}, time={time.time() - st:.3f}s")
+    nframes = smart_nframes(ele, total_frames=total_frames, video_fps=video_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    start_time = ele.get("video_start", 0.0) # TODO:
+    timestamps = [start_time + i / video_fps for i in idx]
+    video = vr.get_batch(idx).asnumpy()
+    video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
+    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+    return video, sample_fps, timestamps
+# Maps a backend name to its reader; each reader takes the video request dict and returns
+# (video, sample_fps, timestamps). fetch_video falls back to the "torchvision" entry on error.
+VIDEO_READER_BACKENDS = {
+    "decord": _read_video_decord,
+    "torchvision": _read_video_torchvision,
+}
+@lru_cache(maxsize=1)
+def get_video_reader_backend() -> str:
+    if is_decord_available():
+        video_reader_backend = "decord"
+    else:
+        video_reader_backend = "torchvision"
+    return video_reader_backend
+def fetch_video(ele: dict, image_factor: int = IMAGE_FACTOR, return_video_sample_fps: bool = False) -> torch.Tensor | list[Image.Image]:
+    if isinstance(ele["video"], str):
+        video_reader_backend = get_video_reader_backend()
+        try:
+            video, sample_fps, timestamps = VIDEO_READER_BACKENDS[video_reader_backend](ele)
+        except Exception as e:
+            logger.warning(f"video_reader_backend {video_reader_backend} error, use torchvision as default, msg: {e}")
+            video, sample_fps, timestamps = VIDEO_READER_BACKENDS["torchvision"](ele)
+        nframes, _, height, width = video.shape
+        min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
+        total_pixels = ele.get("total_pixels", VIDEO_TOTAL_PIXELS)
+        max_pixels = max(min(VIDEO_MAX_PIXELS, total_pixels / nframes * FRAME_FACTOR), int(min_pixels * 1.05))
+        max_pixels_supposed = ele.get("max_pixels", max_pixels)
+        if max_pixels_supposed > max_pixels:
+            logger.warning(f"The given max_pixels[{max_pixels_supposed}] exceeds limit[{max_pixels}].")
+        max_pixels = min(max_pixels_supposed, max_pixels)
+        if "resized_height" in ele and "resized_width" in ele:
+            resized_height, resized_width = smart_resize(
+                ele["resized_height"],
+                ele["resized_width"],
+                factor=image_factor,
+            )
+        else:
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=image_factor,
+                min_pixels=min_pixels,
+                max_pixels=max_pixels,
+            )
+        video = transforms.functional.resize(
+            video,
+            [resized_height, resized_width],
+            interpolation=InterpolationMode.BICUBIC,
+            antialias=True,
+        ).float()
+        if return_video_sample_fps:
+            return video, sample_fps, timestamps
+        return video
+    else:
+        assert isinstance(ele["video"], (list, tuple))
+        process_info = ele.copy()
+        process_info.pop("type", None)
+        process_info.pop("video", None)
+        images = [
+            fetch_image({"image": video_element, **process_info}, size_factor=image_factor)
+            for video_element in ele["video"]
+        ]
+        nframes = adjust_by_factor(len(images), FRAME_FACTOR, method='ceil')
+        if len(images) < nframes:
+            images.extend([images[-1]] * (nframes - len(images)))
+        timestamps = [-1 for i in range(nframes)] # not sure about this
+        if return_video_sample_fps:
+            return images, process_info.pop("fps", 2.0), timestamps
+        return images
+class Eagle3_VLProcessorKwargs(ProcessingKwargs, total=False):
+    # see processing_utils.ProcessingKwargs documentation for usage.
+    # Per-modality default kwargs; Eagle3_VLProcessor.__call__ passes this class to `_merge_kwargs`.
+    _defaults = {
+        "text_kwargs": {
+            # Padding is off by default.
+            "padding": False,
+        },
+        "images_kwargs": {},
+        "videos_kwargs": {},
+    }
+class Eagle3_VLProcessor(ProcessorMixin):
+    r"""
+    Constructs a Eagle3_VL processor which wraps a Eagle3_VL video processor, Eagle3_VL image processor and a Eagle3_VL tokenizer into a single processor.
+    [`Eagle3_VLProcessor`] offers all the functionalities of [`Eagle3_VLVideoProcessor`], [`Eagle3_VLImageProcessor`] and [`Eagle3_VLTokenizer`]. See the
+    [`~Eagle3_VLVideoProcessor.__call__`], [`~Eagle3_VLProcessor.__call__`] and [`~Eagle3_VLProcessor.decode`] for more information.
+    Args:
+        image_processor ([`LlavaOnevisionImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        num_image_tokens (`int`, *optional*):
+            Number of image tokens for one image that will be returned by the vision tower.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Should be the same as in the model's config.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        image_token (`str`, *optional*, defaults to `"<IMG_CONTEXT>"`):
+            Special token used to denote image location.
+        video_token (`str`, *optional*, defaults to `"<IMG_CONTEXT>"`):
+            Special token used to denote video location.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "num_image_tokens",
+        "vision_feature_select_strategy",
+        "image_token",
+        "video_token",
+        "images_kwargs",
+        "videos_kwargs",
+        "text_kwargs",
+    ]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        vision_feature_select_strategy=None,
+        chat_template=None,
+        image_token='<IMG_CONTEXT>',
+        video_token='<IMG_CONTEXT>',
+        pixels_per_token=28*28,
+        image_placeholder='image',
+        video_placeholder='video',
+        image_start_token='<img>',
+        image_end_token='</img>',
+        **kwargs,
+    ):
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
+        self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if getattr(tokenizer, "image_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.image_token)
+        )
+        self.video_token_id = (
+            tokenizer.video_token_id
+            if getattr(tokenizer, "video_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.video_token)
+        )
+        self.image_placeholder = image_placeholder
+        self.video_placeholder = video_placeholder
+        self.pixels_per_token = pixels_per_token
+        self.image_start_token = image_start_token
+        self.image_end_token = image_end_token
+        if 'auto_map' in kwargs:
+            self.auto_map = kwargs['auto_map']
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+    def replace_media_placeholder(self, text, image_list, video_list, timestamps_list, fps_list, **output_kwargs):
+        num_of_images_in_this_sample = 0
+        num_of_videos_in_this_sample = 0
+        # Regular expression pattern to match formats like <image-1> or <video-2>
+        pattern = re.compile(rf"<({self.image_placeholder}|{self.video_placeholder})-(\d+)>")
+        unified_frame_list = []
+        # Function to replace tags in a single text
+        def replace_in_text(text):
+            # repl callback function for each match replacement operation
+            def repl(match):
+                nonlocal unified_frame_list
+                nonlocal num_of_images_in_this_sample
+                nonlocal num_of_videos_in_this_sample
+                media_type = match.group(1)          # 'image' or 'video'
+                idx_in_list = int(match.group(2)) - 1   # Convert to list index (0-based)
+                # Select the corresponding path based on media type
+                idx_mapper = {0: "first", 1: "second", 2: "third", 3: "fourth", 4: "fifth", 5: "sixth", 6: "seventh", 7: "eighth", 8: "ninth", 9: "tenth"}
+                if media_type == 'image':
+                    image_inputs = self.image_processor(images=[image_list[idx_in_list]], videos=None, **output_kwargs["images_kwargs"])
+                    image_height, image_width = image_inputs['image_sizes'][0]
+                    assert image_height <= IMAGE_MAX_SIZE and image_width <= IMAGE_MAX_SIZE, f"image_height: {image_height}, image_width: {image_width}"
+                    image_tokens = image_height * image_width // self.pixels_per_token
+                    special_placeholder = f"<image {idx_in_list+1}>{self.image_start_token}{self.image_token * image_tokens}{self.image_end_token}"
+                    unified_frame_list.append(image_inputs)
+                    num_of_images_in_this_sample += 1
+                elif media_type == 'video':
+                    video_inputs = self.image_processor(images=None, videos=video_list[idx_in_list], **output_kwargs["videos_kwargs"])
+                    N, C, image_height, image_width = video_inputs['pixel_values'].shape
+                    image_tokens = image_height * image_width // self.pixels_per_token
+                    assert image_height <= IMAGE_MAX_SIZE and image_width <= IMAGE_MAX_SIZE, f"image_height: {image_height}, image_width: {image_width}"
+                    if timestamps_list is not None and -1 not in timestamps_list:
+                        frame_timestamps = timestamps_list[idx_in_list]
+                    else:
+                        frame_timestamps = None
+                    sampled_fps = fps_list[idx_in_list] if fps_list is not None else None
+                    num_of_tokens_list = [image_tokens] * N
+                    if frame_timestamps is not None:
+                        assert len(frame_timestamps) == len(num_of_tokens_list), f"The number of timestamps is not equal to the number of frames: {len(frame_timestamps)} != {len(num_of_tokens_list)}"
+                        special_placeholder = [f"Frame {i+1} sample at {frame_timestamps[i]:.2f}s: {self.image_start_token}{self.image_token * num_of_tokens}{self.image_end_token}" for i, num_of_tokens in enumerate(num_of_tokens_list)]
+                    else:
+                        special_placeholder = [f"Frame {i+1}: {self.image_start_token}{self.image_token * num_of_tokens}{self.image_end_token}" for i, num_of_tokens in enumerate(num_of_tokens_list)]
+                    if sampled_fps is not None:
+                        special_placeholder = f"The {idx_mapper[idx_in_list]} video sampled with {sampled_fps:.2f} fps: " + "".join(special_placeholder)
+                    else:
+                        special_placeholder = f"The {idx_mapper[idx_in_list]} video: " + "".join(special_placeholder)
+                    unified_frame_list.append(video_inputs)
+                    num_of_videos_in_this_sample += 1
+                else:
+                    raise ValueError(f'Unknown media type: {media_type}')
+                return special_placeholder
+            return pattern.sub(repl, text)
+        text = replace_in_text(text)
+        if len(unified_frame_list) > 0:
+            pixel_values = [frame['pixel_values'] for frame in unified_frame_list]
+            image_sizes = torch.cat([frame['image_sizes'] for frame in unified_frame_list], dim=0)
+        else:
+            pixel_values = []
+            image_sizes = []
+        return text, pixel_values, image_sizes, num_of_images_in_this_sample, num_of_videos_in_this_sample
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        audio=None,
+        videos: VideoInput = None,
+        **kwargs: Unpack[Eagle3_VLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch tensor.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **pixel_values_videos** -- Pixel values of a video input to be fed to a model. Returned when `videos` is not `None`.
+            - **image_sizes** -- Size of each image that will be used to unpad an image. Returned when `images` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            Eagle3_VLProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if isinstance(text, str):
+            text_list = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+        elif isinstance(text, list) and isinstance(text[0], str):
+            text_list = text
+        if images is None: images = []
+        if videos is None: videos = []
+        pixel_values_list = []
+        image_sizes_list = []
+        new_sample_list = []
+        image_start_idx = 0
+        video_start_idx = 0
+        timestamps_batch = output_kwargs['videos_kwargs'].pop("timestamps", None)
+        fps_batch = output_kwargs['videos_kwargs'].pop("fps", None)
+        for sample in text_list:
+            timestamps_list = timestamps_batch[video_start_idx:] if timestamps_batch is not None else None
+            fps_list = fps_batch[video_start_idx:] if fps_batch is not None else None
+            sample, pixel_values, image_sizes, num_of_images_in_this_sample, num_of_videos_in_this_sample = self.replace_media_placeholder(sample, images[image_start_idx:], videos[video_start_idx:], timestamps_list, fps_list, **output_kwargs)
+            new_sample_list.append(sample)
+            pixel_values_list.extend(pixel_values)
+            image_sizes_list.extend(image_sizes)
+            image_start_idx += num_of_images_in_this_sample
+            video_start_idx += num_of_videos_in_this_sample
+        if len(pixel_values) > 0:
+            image_inputs = {
+                'pixel_values':pixel_values_list,
+                'image_sizes': torch.stack(image_sizes_list, dim=0)
+            }
+        else:
+            image_inputs = {}
+        video_inputs = {}
+        text_inputs = self.tokenizer(new_sample_list, **output_kwargs["text_kwargs"])
+        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs})
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+    # override: reject a file path and make sure the target directory exists before delegating to the parent
+    def save_pretrained(self, save_directory, **kwargs):
+        if os.path.isfile(save_directory):
+            raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file")
+        os.makedirs(save_directory, exist_ok=True)
+        outputs = super().save_pretrained(save_directory, **kwargs)
+        return outputs
+    # override: always return the processor itself, even when the parent returns a (processor, unused_kwargs) tuple
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        processor = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+        # if return_unused_kwargs a tuple is returned where the second element is 'unused_kwargs'
+        if isinstance(processor, tuple):
+            processor = processor[0]
+        return processor
+    # Copy from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
+    def process_vision_info(
+        self,
+        conversations: list[dict] | list[list[dict]],
+        return_video_kwargs: bool = False,
+    ) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, Optional[dict]]:
+        vision_infos = self.extract_vision_info(conversations)
+        ## Read images or videos
+        image_inputs = []
+        video_inputs = []
+        video_sample_fps_list = []
+        video_timestamps_list = []
+        for vision_info in vision_infos:
+            if "image" in vision_info or "image_url" in vision_info:
+                image_inputs.append(fetch_image(vision_info))
+            elif "video" in vision_info:
+                video_input, video_sample_fps, video_timestamps = fetch_video(vision_info, return_video_sample_fps=True)
+                video_sample_fps_list.append(video_sample_fps)
+                video_inputs.append(video_input)
+                video_timestamps_list.append(video_timestamps)
+            else:
+                raise ValueError("image, image_url or video should in content.")
+        if len(image_inputs) == 0:
+            image_inputs = None
+        if len(video_inputs) == 0:
+            video_inputs = None
+        if return_video_kwargs:
+            return image_inputs, video_inputs, {'fps': video_sample_fps_list, 'timestamps': video_timestamps_list}
+        return image_inputs, video_inputs
+    def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]:
+        vision_infos = []
+        if isinstance(conversations[0], dict):
+            conversations = [conversations]
+        for conversation in conversations:
+            for message in conversation:
+                if isinstance(message["content"], list):
+                    for ele in message["content"]:
+                        if (
+                            "image" in ele
+                            or "image_url" in ele
+                            or "video" in ele
+                            or ele["type"] in ("image", "image_url", "video")
+                        ):
+                            vision_infos.append(ele)
+        return vision_infos
+    def py_apply_chat_template(self, messages, tokenize=False, add_generation_prompt=False):
+        """
+        Renders a chat conversation using a custom template with verification of tokens.
+        The purpose is to check for the existence of tokens like "<image-1>" or "<video-1>"
+        in the message text and skip adding them if they already exist.
+        Args:
+            messages (list): A list of message dictionaries. Each message should contain:
+                - 'role': The role of the speaker (e.g., 'system', 'user', 'assistant').
+                - 'content': Either a string or a list of content blocks. In the list each block may contain:
+                      * 'type': The type of content, such as 'image' or 'video'.
+                      * 'text': The actual text if present.
+                      * Other keys such as 'image', 'image_url', or 'video'.
+            add_generation_prompt (bool): If True, appends "<|im_start|>assistant" at the end of the rendered string.
+            tokenize (bool): If True, tokenize the rendered string.
+        Returns:
+            str: The final rendered chat string according to the specified template.
+        """
+        assert tokenize == False, "tokenize is not supported yet"
+        result = ""
+        image_count = 0
+        video_count = 0
+        message_text = ""
+        for idx, message in enumerate(messages):
+            if message.get('role') != 'user': continue
+            # If content is a string, simply output it.
+            content = message.get('content')
+            if isinstance(content, str):
+                message_text += content
+            elif isinstance(content, list):
+                # Process each content item.
+                for item in content:
+                    # If the block is a dictionary and contains text, add it to message_text.
+                    if isinstance(item, dict) and "text" in item:
+                        message_text += item["text"]
+                    # If an item is already a string in the list, add it directly.
+                    elif isinstance(item, str):
+                        message_text += item
+        for idx, message in enumerate(messages):
+            # If the first message is not from the system, prepend a default system message.
+            if idx == 0 and message.get('role') != 'system':
+                result += "<|im_start|>system\n"
+                result += "You are a helpful assistant.\n"
+                result += "<|im_end|>\n"
+            # Start the current message block with its role.
+            result += f"<|im_start|>{message.get('role', '')}\n"
+            content = message.get('content')
+            # If content is a string, simply output it.
+            if isinstance(content, str):
+                result += content
+                result += "<|im_end|>\n"
+            else:
+                # Process each content item.
+                for item in content:
+                    # Check if the item is an image (explicitly by type or by key presence).
+                    if (isinstance(item, dict) and (item.get('type') == 'image' or 'image' in item or 'image_url' in item)):
+                        image_count += 1
+                        candidate_token = f"<image-{image_count}>"
+                        # Only add the token if it is not already present in the collected text.
+                        if candidate_token not in message_text:
+                            result += candidate_token
+                    # Check if the item is a video.
+                    elif (isinstance(item, dict) and (item.get('type') == 'video' or 'video' in item)):
+                        video_count += 1
+                        candidate_token = f"<video-{video_count}>"
+                        # Only add the token if it is not already present.
+                        if candidate_token not in message_text:
+                            result += candidate_token
+                    # If the item contains text, add it.
+                    elif isinstance(item, dict) and 'text' in item:
+                        result += item['text']
+                    # If the item is a string (and not handled already), add it.
+                    elif isinstance(item, str):
+                        result += item
+                result += "<|im_end|>\n"
+        # Optionally add assistant generation prompt at the end.
+        if add_generation_prompt:
+            result += "<|im_start|>assistant\n"
+        return result
+    @classmethod
+    def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs):
+        """
+        Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters.
+        Args:
+            processor_dict (`Dict[str, Any]`):
+                Dictionary that will be used to instantiate the processor object. Such a dictionary can be
+                retrieved from a pretrained checkpoint by leveraging the
+                [`~processing_utils.ProcessingMixin.to_dict`] method.
+            kwargs (`Dict[str, Any]`):
+                Additional parameters from which to initialize the processor object.
+        Returns:
+            [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those
+            parameters.
+        """
+        processor_dict = processor_dict.copy()
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+        # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs
+        # If we don't pop, some specific kwargs will raise a warning
+        if "processor_class" in processor_dict:
+            del processor_dict["processor_class"]
+        #if "auto_map" in processor_dict:
+        #    del processor_dict["auto_map"]
+        unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs)
+        processor = cls(*args, **processor_dict)
+        # Update processor with kwargs if needed
+        for key in set(kwargs.keys()):
+            if hasattr(processor, key):
+                setattr(processor, key, kwargs.pop(key))
+        kwargs.update(unused_kwargs)
+        logger.info(f"Processor {processor}")
+        if return_unused_kwargs:
+            return processor, kwargs
+        else:
+            return processor
+# Public names of this module (what `from <module> import *` exports).
+__all__ = ["Eagle3_VLProcessor"]

processor_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_eagle3_vl.Eagle3_VLProcessor"
+  },
+  "image_end_token": "</img>",
+  "image_placeholder": "image",
+  "image_start_token": "<img>",
+  "image_token": "<IMG_CONTEXT>",
+  "pixels_per_token": 784,
+  "processor_class": "Eagle3_VLProcessor",
+  "video_placeholder": "video",
+  "video_token": "<IMG_CONTEXT>",
+  "vision_feature_select_strategy": null
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<IMG_CONTEXT>",
+    "<img>",
+    "</img>",
+    "<box>",
+    "</box>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<interval>",
+    "</interval>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,344 @@

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151666": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151667": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151668": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151669": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151678": {
+      "content": "<interval>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151679": {
+      "content": "</interval>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<IMG_CONTEXT>",
+    "<img>",
+    "</img>",
+    "<box>",
+    "</box>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<interval>",
+    "</interval>"
+  ],
+  "auto_map": {
+    "AutoProcessor": "processing_eagle3_vl.Eagle3_VLProcessor"
+  },
+  "bos_token": null,
+  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = '' %}\n    {%- endif %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n 
       {%- else %}\n            {%- if '</think>' in content %}\n                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n            {%- else %}\n                {{- '<|im_start|>' + message.role + '\\n' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or 
(messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- endif %}\n{%- endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "Eagle3_VLProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.0,
+    "total_flos": 5.461479216500315e+20,
+    "train_loss": 0.09150631395949999,
+    "train_runtime": 9502.0046,
+    "train_samples": 166521,
+    "train_samples_per_second": 17.525,
+    "train_steps_per_second": 0.617
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ada0e77f0909ed3123bc44aa18a7b80ab583a478c366421ce26ab09d2944f6d5
+size 7160

training_log.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:599fce10530a172251643e7a65d4374016355261a8b2d32d4b2712fd20da674a
+size 203679795

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff