| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| import math |
| import warnings |
| from dataclasses import dataclass |
| from typing import Any, Callable, Optional, Tuple, Union, List |
|
|
| import numpy as np |
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss |
| from torch.nn.init import _calculate_fan_in_and_fan_out |
|
|
| from transformers.activations import ACT2FN |
| from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask |
| from transformers.configuration_utils import PretrainedConfig |
|
|
| from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput |
| from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel |
| from transformers.utils import ( |
| ModelOutput, |
| add_start_docstrings, |
| add_start_docstrings_to_model_forward, |
| can_return_tuple, |
| logging, |
| replace_return_docstrings, |
| ) |
| from transformers.models.siglip2.configuration_siglip2 import Siglip2Config, Siglip2TextConfig |
| from collections import defaultdict |
| from itertools import accumulate |
| from math import isqrt |
| from typing import Dict |
|
|
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
| import inspect |
| import os |
| from typing import Optional, Tuple |
|
|
| import torch |
| import torch.nn.functional as F |
|
|
| from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal, logging |
| from transformers.integrations.flash_attention import flash_attention_forward as original_flash_attention_forward |
| flash_241 = is_flash_attn_greater_or_equal("2.4.1") |
| deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" |
|
|
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func, flash_attn_varlen_qkvpacked_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input

    # Probe the installed kernel once at import time: only some flash-attn
    # builds expose the `window_size` keyword used for sliding-window attention.
    _flash_supports_window_size = "window_size" in inspect.signature(flash_attn_func).parameters
else:
    # Keep the flag defined so that code reading it does not fail with a
    # NameError on this name when flash-attn is not installed.
    _flash_supports_window_size = False
|
|
|
|
| def _flash_attention_forward( |
| query_states: torch.Tensor, |
| key_states: torch.Tensor, |
| value_states: torch.Tensor, |
| attention_mask: torch.Tensor, |
| query_length: int, |
| is_causal: bool, |
| dropout: float = 0.0, |
| position_ids: Optional[torch.Tensor] = None, |
| softmax_scale: Optional[float] = None, |
| sliding_window: Optional[int] = None, |
| use_top_left_mask: bool = False, |
| softcap: Optional[float] = None, |
| deterministic: bool = None, |
| cu_seq_lens_q: Optional[torch.LongTensor] = None, |
| cu_seq_lens_k: Optional[torch.LongTensor] = None, |
| max_length_q: Optional[int] = None, |
| max_length_k: Optional[int] = None, |
| target_dtype: Optional[torch.dtype] = None, |
| **kwargs, |
| ): |
| """ |
| Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token |
| first unpad the input, then computes the attention scores and pad the final attention scores. |
| Args: |
| query_states (`torch.Tensor`): |
| Input query states to be passed to Flash Attention API |
| key_states (`torch.Tensor`): |
| Input key states to be passed to Flash Attention API |
| value_states (`torch.Tensor`): |
| Input value states to be passed to Flash Attention API |
| attention_mask (`torch.Tensor`): |
| The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the |
| position of padding tokens and 1 for the position of non-padding tokens. |
| dropout (`int`, *optional*): |
| Attention dropout |
| softmax_scale (`float`, *optional*): |
| The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) |
| use_sliding_windows (`bool`, *optional*): |
| Whether to activate sliding window attention. |
| """ |
| |
| if not use_top_left_mask: |
| causal = is_causal |
| else: |
| |
| causal = is_causal and query_length != 1 |
| |
| use_sliding_windows = ( |
| _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window |
| ) |
| flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {} |
| if flash_241: |
| if deterministic is None: |
| deterministic = deterministic_g |
| flash_kwargs["deterministic"] = deterministic |
|
|
| if softcap is not None: |
| flash_kwargs["softcap"] = softcap |
|
|
| attn_output = flash_attn_varlen_func( |
| query_states[0], |
| key_states[0], |
| value_states[0], |
| cu_seqlens_q=cu_seq_lens_q, |
| cu_seqlens_k=cu_seq_lens_k, |
| max_seqlen_q=max_length_q, |
| max_seqlen_k=max_length_k, |
| dropout_p=dropout, |
| softmax_scale=softmax_scale, |
| causal=causal, |
| **flash_kwargs, |
| ) |
|
|
|
|
| return attn_output |
|
|
|
|
| from transformers.utils import is_flash_attn_greater_or_equal_2_10 |
| _use_top_left_mask = not is_flash_attn_greater_or_equal_2_10() |
|
|
def flash_attention_forward_for_packing(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    sliding_window: Optional[int] = None,
    softcap: Optional[float] = None,
    seq_len_list: Optional[List[int]] = None,
    **kwargs,
) -> Tuple[torch.Tensor, None]:
    """
    Attention-interface adapter that runs Flash Attention over a packed sequence.

    Args:
        module (`torch.nn.Module`):
            The calling attention module; `module.is_causal` and `module.config` are read.
        query, key, value (`torch.Tensor`):
            Inputs whose dimensions 1 and 2 are swapped before the kernel call
            (the sequence length is read from `query.shape[2]`).
        attention_mask (`torch.Tensor`, *optional*):
            Forwarded unchanged to `_flash_attention_forward`.
        dropout (`float`, *optional*, defaults to 0.0):
            Attention dropout probability.
        scaling (`float`, *optional*):
            Softmax scale forwarded to the kernel.
        sliding_window (`int`, *optional*):
            Sliding window size forwarded to the kernel.
        softcap (`float`, *optional*):
            Soft-capping value forwarded to the kernel.
        seq_len_list (`List[int]`):
            Length of every sub-sequence in the pack, in order. Required even though it
            defaults to `None` for interface compatibility.

    Returns:
        `Tuple[torch.Tensor, None]`: the kernel output and `None` in place of attention weights.

    Raises:
        ValueError: if `seq_len_list` is missing or empty.
    """
    if seq_len_list is None or len(seq_len_list) == 0:
        raise ValueError("`seq_len_list` must contain the length of every packed sub-sequence.")

    seq_len = query.shape[2]

    # Swap dimensions 1 and 2 to obtain the layout passed on to the flash kernel wrapper.
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    # Resolve the dtype float32 inputs should be cast to before reaching the kernel.
    target_dtype = None
    if query.dtype == torch.float32:
        if torch.is_autocast_enabled():
            target_dtype = torch.get_autocast_gpu_dtype()
        elif hasattr(module.config, "_pre_quantization_dtype"):
            target_dtype = module.config._pre_quantization_dtype
        else:
            target_dtype = next(
                layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)
            ).weight.dtype

    # `is_causal` is taken from the module itself, never from the caller's kwargs.
    kwargs.pop("is_causal", None)

    lengths = torch.tensor(seq_len_list, device=query.device, dtype=torch.int32)
    # Cumulative boundaries with a leading zero, kept in int32.
    cu_seqlens = F.pad(torch.cumsum(lengths, dim=0, dtype=torch.int32), (1, 0))
    max_seq_len = max(seq_len_list)

    attn_output = _flash_attention_forward(
        query,
        key,
        value,
        attention_mask,
        query_length=seq_len,
        is_causal=module.is_causal,
        dropout=dropout,
        softmax_scale=scaling,
        sliding_window=sliding_window,
        softcap=softcap,
        use_top_left_mask=_use_top_left_mask,
        target_dtype=target_dtype,
        cu_seq_lens_q=cu_seqlens,
        cu_seq_lens_k=cu_seqlens,
        max_length_q=max_seq_len,
        max_length_k=max_seq_len,
        **kwargs,
    )
    # The kernel result is returned as-is: squeezing dim 0 would only take effect for a
    # one-token pack, where it would remove the token axis and change the rank.
    return attn_output, None
|
|
| |
| _CONFIG_FOR_DOC = "Siglip2Config" |
|
|
|
|
|
|
class Siglip2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Siglip2VisionModel`]. It is used to instantiate a
    Siglip2 vision encoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1152):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 4304):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 27):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        num_patches (`int`, *optional*, defaults to 256):
            Number of entries in the learned position-embedding table. The table is viewed as a square grid and
            resized to each image's patch grid.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        window_size (`int`, *optional*, defaults to 14):
            Side length, in patches, of the square windows the patch grid is split into.
        full_attention_indexes (`List[int]`, *optional*, defaults to `[7, 14, 21, 26]`):
            Layer indexes associated with full (non-windowed) attention. Presumably consumed by the encoder to
            decide which layers attend over whole images; confirm against the encoder implementation.
        use_rope (`bool`, *optional*, defaults to `True`):
            Whether the attention layers apply 2D rotary position embeddings to queries and keys.
        use_windows_attn (`bool`, *optional*, defaults to `True`):
            Whether attention layers may restrict attention to windows when asked to.

    Example:

    ```python
    >>> from transformers import Siglip2VisionConfig, Siglip2VisionModel

    >>> # Initializing a Siglip2VisionConfig with default values
    >>> configuration = Siglip2VisionConfig()

    >>> # Initializing a Siglip2VisionModel (with random weights) from that configuration
    >>> model = Siglip2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "siglip2_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1152,
        intermediate_size=4304,
        num_hidden_layers=27,
        num_attention_heads=16,
        num_channels=3,
        num_patches=256,
        patch_size=14,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        window_size=14,
        full_attention_indexes=None,
        use_rope=True,
        use_windows_attn=True,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.num_patches = num_patches
        self.window_size = window_size
        # A fresh list per instance: a list literal in the signature would be shared
        # (and mutated) across every config created with the default.
        self.full_attention_indexes = (
            [7, 14, 21, 26] if full_attention_indexes is None else list(full_attention_indexes)
        )
        self.use_windows_attn = use_windows_attn
        self.use_rope = use_rope
|
|
|
|
@dataclass
class Siglip2VisionOutput(ModelOutput):
    """
    Output container for the vision model: pooled/projected image embeddings plus the usual encoder outputs.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when the model is initialized with `with_projection=True`):
            Image embeddings produced by applying the projection layer to the pooler output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states at the output of the model's last layer.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            One tensor for the embedding output (if the model has an embedding layer) plus one per layer, each of
            shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            One tensor per layer of shape `(batch_size, num_heads, sequence_length, sequence_length)` holding the
            post-softmax attention weights used for the weighted average in the self-attention heads.
        spatial_shapes (`torch.LongTensor`, *optional*):
            Spatial shapes associated with the encoded images; presumably one `(height, width)` patch-grid
            size per image (confirm against the producing model).
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    spatial_shapes: Optional[torch.LongTensor] = None
|
|
|
|
def convert_image_to_patches(image: "torch.Tensor", patch_size: int) -> "torch.Tensor":
    """
    Flatten one image of shape (num_channels, image_height, image_width) into a 2D tensor of patches.

    The result has shape (num_patches_height * num_patches_width, patch_size * patch_size * num_channels);
    patches are listed row by row and, inside a patch, values are ordered by pixel row, pixel column, channel.
    """
    channels, height, width = image.shape
    rows, cols = height // patch_size, width // patch_size
    # (C, rows, p, cols, p) -> (rows, p, cols, p, C) -> (rows, cols, p, p, C)
    grid = image.reshape(channels, rows, patch_size, cols, patch_size)
    grid = grid.movedim(0, -1).transpose(1, 2)
    return grid.reshape(rows * cols, -1)
|
|
def convert_images_to_patches(image: "torch.Tensor", patch_size: int) -> "torch.Tensor":
    """
    Flatten a batch of images of shape (batch_size, num_channels, image_height, image_width) into patches.

    The batch and patch-grid dimensions are merged, so the result is a 2D tensor of shape
    (batch_size * num_patches_height * num_patches_width, patch_size * patch_size * num_channels).
    Patches of the first image come first, listed row by row.

    Raises:
        ValueError: if the image height or width is not a multiple of `patch_size`.
    """
    batch_size, num_channels, image_height, image_width = image.shape
    if image_height % patch_size != 0 or image_width % patch_size != 0:
        # An explicit check (instead of `assert`) still runs when Python is started with -O.
        raise ValueError(
            f"Image size ({image_height}, {image_width}) must be divisible by patch_size={patch_size}."
        )
    num_patches_height = image_height // patch_size
    num_patches_width = image_width // patch_size
    patched_image = image.reshape(
        batch_size, num_channels, num_patches_height, patch_size, num_patches_width, patch_size
    )
    # -> (batch, rows, cols, pixel_row, pixel_col, channel)
    patched_image = patched_image.permute(0, 2, 4, 3, 5, 1)
    return patched_image.reshape(batch_size * num_patches_height * num_patches_width, -1)
|
|
|
|
class Siglip2VisionEmbeddings(nn.Module):
    """
    Embed packed, variable-resolution images and regroup the tokens into attention windows.

    Images are patchified, linearly projected, summed with a position-embedding grid resized to each
    image's patch grid, and finally reordered so that the tokens of every window are contiguous.
    """

    def __init__(self, config: "Siglip2VisionConfig"):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.patch_size = config.patch_size
        self.window_size = config.window_size

        self.patch_embedding = nn.Linear(
            in_features=config.num_channels * self.patch_size * self.patch_size,
            out_features=self.embed_dim,
        )

        self.num_patches = config.num_patches
        # Side length of the square grid the position table is viewed as (exact integer root).
        self.position_embedding_size = isqrt(self.num_patches)
        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)

    def split_patch_embeddings_to_windows_with_meta(self, patch_embeds, batch_hw, window_size):
        """
        Reorder packed patch embeddings so that each window's tokens are contiguous.

        Args:
            patch_embeds: Tensor of shape (1, sum(H_i * W_i), C), images concatenated in row-major order.
            batch_hw: Tensor holding one (H_i, W_i) patch-grid size per image.
            window_size: Side length of a window, in patches.

        Returns:
            all_tokens: Tensor of shape (1, sum(H_i * W_i), C). Windows are ordered by image, then by
                window row, then by window column; tokens inside a window are row-major. Windows on the
                bottom/right border only contain the patches that exist in the image.
            win_meta_list: One dict per window, in the same order, with keys
                - img_idx: index of the image in `batch_hw`
                - patch_hw: patch-grid size (H, W) of that image
                - win_xy: (h0, w0), top-left corner of the window relative to the image grid
                - win_hw: (h_eff, w_eff), size of the window part that lies inside the image
            reverse_mapping: Long tensor (created on the default device) such that
                `all_tokens[0, reverse_mapping]` restores the original token order.
        """
        channels = patch_embeds.shape[-1]

        window_tokens = []
        source_indices = []
        win_meta_list = []
        image_start = 0
        for img_idx, (height, width) in enumerate(batch_hw.tolist()):
            height, width = int(height), int(width)
            num_tokens = height * width
            token_grid = patch_embeds[0, image_start:image_start + num_tokens].reshape(height, width, channels)
            # Same grid layout, but holding each token's position in the original packed sequence.
            index_grid = torch.arange(image_start, image_start + num_tokens, dtype=torch.long).reshape(
                height, width
            )

            for h0 in range(0, height, window_size):
                h_eff = min(window_size, height - h0)
                for w0 in range(0, width, window_size):
                    w_eff = min(window_size, width - w0)
                    window_tokens.append(
                        token_grid[h0:h0 + h_eff, w0:w0 + w_eff].reshape(h_eff * w_eff, channels)
                    )
                    source_indices.append(index_grid[h0:h0 + h_eff, w0:w0 + w_eff].reshape(-1))
                    win_meta_list.append({
                        'img_idx': img_idx,
                        'patch_hw': (height, width),
                        'win_xy': (h0, w0),
                        'win_hw': (h_eff, w_eff),
                    })
            image_start += num_tokens

        all_tokens = torch.cat(window_tokens, dim=0).unsqueeze(0)

        # `source_index[p]` is the original position of windowed token p; invert that permutation.
        source_index = torch.cat(source_indices, dim=0)
        reverse_mapping = torch.empty_like(source_index)
        reverse_mapping[source_index] = torch.arange(source_index.numel(), dtype=torch.long)

        return all_tokens, win_meta_list, reverse_mapping

    @staticmethod
    def resize_positional_embeddings(
        positional_embeddings: torch.Tensor,
        spatial_shapes: torch.LongTensor,
    ) -> torch.Tensor:
        """
        Resize the position-embedding grid to every image's patch grid and pack the results.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim).
            spatial_shapes (`torch.LongTensor`):
                Tensor of shape (batch_size, 2) with the target (height, width) of every image.

        Returns:
            `torch.Tensor`: Embeddings of shape (1, sum(height_i * width_i), embed_dim), images
            concatenated in order and each flattened row-major.
        """
        embed_dim = positional_embeddings.shape[-1]
        source_dtype = positional_embeddings.dtype

        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)

        # Interpolate in float32 on CPU, then cast back to the source dtype afterwards.
        if positional_embeddings.device.type == "cpu":
            positional_embeddings = positional_embeddings.to(torch.float32)

        # Packs frequently repeat the same resolution; interpolate each distinct shape once.
        resized_by_shape = {}
        resulted_positional_embeddings = []
        for height, width in spatial_shapes.tolist():
            shape_key = (height, width)
            if shape_key not in resized_by_shape:
                resized = F.interpolate(
                    positional_embeddings,
                    size=shape_key,
                    mode="bilinear",
                    align_corners=False,
                    antialias=True,
                )
                # (1, embed_dim, height, width) -> (height * width, embed_dim)
                resized = resized.reshape(embed_dim, height * width).transpose(0, 1)
                resized_by_shape[shape_key] = resized.to(source_dtype)
            resulted_positional_embeddings.append(resized_by_shape[shape_key])

        return torch.cat(resulted_positional_embeddings, dim=0).unsqueeze(0)

    def get_spatial_shapes(self, bchw_list: List[Tuple[int, int, int, int]]) -> torch.Tensor:
        """
        Convert (batch, channels, height, width) shapes into one patch-grid size per image.

        Returns a tensor with one `(height // patch_size, width // patch_size)` row per image; a shape
        with batch size `b` contributes `b` identical rows.
        """
        hw_list = []
        for b, _, h, w in bchw_list:
            hw_list.extend([(h // self.patch_size, w // self.patch_size)] * b)
        return torch.tensor(hw_list)

    def forward(
        self, pixel_values: List[torch.Tensor]
    ) -> Tuple[torch.Tensor, List[Dict], torch.Tensor, torch.Tensor]:
        """
        Args:
            pixel_values:
                Iterable of image groups, each a tensor of shape (batch_size, num_channels, height, width).
                Images inside a group share one resolution; different groups may differ.

        Returns:
            A tuple `(windows_tensor, win_meta_list, spatial_shapes, reverse_mapping)`: the packed,
            window-ordered embeddings of shape (1, total_patches, embed_dim), the per-window metadata,
            the per-image patch-grid sizes, and the index tensor restoring the original token order.
        """
        image_groups = list(pixel_values)
        bchw_list = [group.shape for group in image_groups]

        flat_patches = torch.cat(
            [convert_images_to_patches(group, self.patch_size) for group in image_groups], dim=0
        )

        # Project the flattened patches with the weights' dtype.
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(flat_patches.to(dtype=target_dtype))

        # View the learned table as a square grid and resize it per image.
        positional_embeddings = self.position_embedding.weight.reshape(
            self.position_embedding_size, self.position_embedding_size, -1
        )
        spatial_shapes = self.get_spatial_shapes(bchw_list)
        resized_positional_embeddings = self.resize_positional_embeddings(
            positional_embeddings, spatial_shapes
        )

        # (total_patches, dim) + (1, total_patches, dim) broadcasts to a batch of one packed sequence.
        embeddings = patch_embeds + resized_positional_embeddings

        windows_tensor, win_meta_list, reverse_mapping = self.split_patch_embeddings_to_windows_with_meta(
            embeddings, spatial_shapes, self.window_size
        )

        return windows_tensor, win_meta_list, spatial_shapes, reverse_mapping
|
|
|
|
class Rope2DPosEmb(nn.Module):
    """
    copy from https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking/blob/main/modeling_kimi_vl.py#L324
    2D rotary position embedding with multi-resolution support.
    This class is intended to be used in the following way:
    1. Create an instance of Rope2DPosEmb. The cis table is computed lazily on first use and cached.
    2. Before each forward pass, call `get_freqs_cis` to get the `freqs_cis` tensor for this iteration.
    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and apply it just before each attention operation.
    The rope is shared across all attention layers and all heads.
    Refs:
    - RoFormer: https://arxiv.org/abs/2104.09864
    - VisionLLaMA: https://arxiv.org/abs/2403.00522
    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py
    Args:
        dim (int): usually the multi-head attention dimension, must be divisible by 4
        max_height (int): the maximum height of the 2D grid
        max_width (int): the maximum width of the 2D grid
        theta_base (float): the base of the theta
        window_size (int): stored on the instance; not used by the methods of this class
    """

    def __init__(self, dim: int, max_height: int, max_width: int, theta_base=10000, window_size=14):
        super().__init__()
        if dim % 4 != 0:
            raise ValueError("dim must be divisible by 4")
        self.dim = dim
        self.max_height = max_height
        self.max_width = max_width
        self.theta_base = theta_base
        self.window_size = window_size

        # Lazily created (max_height, max_width, dim // 2) complex table.
        self.freqs_cis = None

    def extra_repr(self):
        return f"dim={self.dim}, max_height={self.max_height}, max_width={self.max_width}, theta_base={self.theta_base}"

    def _precompute_freqs_cis(self, device: torch.device) -> torch.Tensor:
        """Calculate the cis(freqs) for each position in the 2D grid.
        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
            width axis:  ret[h, w, 2*i]   = cis(w * theta_base**(-4*i/dim))
            height axis: ret[h, w, 2*i+1] = cis(h * theta_base**(-4*i/dim))   with (i in [0, dim//4))
        note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
        """
        N = self.max_height * self.max_width
        flat_pos = torch.arange(0, N).float().to(device)
        x_pos = flat_pos % self.max_width  # column index of every grid cell
        y_pos = flat_pos // self.max_width  # row index of every grid cell
        dim_range = torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(device)
        freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
        x_freqs = torch.outer(x_pos, freqs).float()
        y_freqs = torch.outer(y_pos, freqs).float()
        x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)
        y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)
        # Interleave the two axes along the last dimension: (N, dim//4, 2) -> (H, W, dim//2)
        freqs_cis = torch.cat([x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1)
        return freqs_cis.reshape(self.max_height, self.max_width, -1)

    def get_freqs_cis(self, win_meta_list: List[Dict], device: torch.device) -> torch.Tensor:
        """
        Gather the rotary factors of every token, window by window.

        Args:
            win_meta_list (List[Dict]): window metadata; each entry provides `win_xy` (top-left corner)
                and `win_hw` (effective window size), both in grid cells.
            device: device the table is created on / moved to.
        Returns:
            freqs_cis: complex tensor of shape (1, sum(h_eff * w_eff), dim//2); windows in list order,
            positions inside a window row-major.
        Raises:
            ValueError: if a window reaches beyond the (max_height, max_width) grid.
        """
        if self.freqs_cis is None:
            self.freqs_cis = self._precompute_freqs_cis(device)
        else:
            # No-op when the cached table already lives on `device`.
            self.freqs_cis = self.freqs_cis.to(device)

        per_window = []
        for win_meta in win_meta_list:
            h0, w0 = win_meta['win_xy']
            h_eff, w_eff = win_meta['win_hw']
            # Slice ends are exclusive, so ending exactly at the table border is valid.
            if h0 + h_eff > self.max_height or w0 + w_eff > self.max_width:
                raise ValueError(
                    f"Window at {(h0, w0)} with size {(h_eff, w_eff)} exceeds the rope grid "
                    f"({self.max_height}, {self.max_width})."
                )
            per_window.append(
                self.freqs_cis[h0:h0 + h_eff, w0:w0 + w_eff].reshape(-1, self.dim // 2)
            )

        return torch.cat(per_window, dim=0).unsqueeze(0)
| |
|
|
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """
    Reference (non-fused) scaled dot-product attention.

    Scores are `query @ key^T * scaling`, optionally shifted by an additive `attention_mask`,
    normalised with a float32 softmax and passed through dropout (active only while
    `module.training` is true). Extra keyword arguments are accepted and ignored.

    Returns:
        `(attn_output, attn_weights)` where `attn_output` has dimensions 1 and 2 swapped
        relative to `attn_weights @ value` and is contiguous.
    """
    scores = (query @ key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask

    # Softmax in float32, then back to the query dtype.
    probs = torch.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)

    context = (probs @ value).transpose(1, 2).contiguous()
    return context, probs
|
|
|
|
|
|
| def _apply_rope_input_validation(x, freqs_cis): |
| assert x.ndim == freqs_cis.ndim + 1, (x.shape, freqs_cis.shape) |
| assert x.shape[:-2] == freqs_cis.shape[:-1], (x.shape, freqs_cis.shape) |
| assert x.shape[-1] == 2 * freqs_cis.shape[-1], (x.shape, freqs_cis.shape) |
| assert freqs_cis.dtype == torch.complex64, freqs_cis.dtype |
|
|
|
|
def apply_rope(
    xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Rotate queries and keys with precomputed rotary factors.

    Args: (The leading dimensions of all inputs should be the same)
        xq: query, tensor of shape (..., num_heads, head_dim)
        xk: key, tensor of shape (..., num_heads, head_dim); its number of heads may differ from `xq`'s
        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
    Returns:
        xq_out, xk_out: tensors with the same shapes and dtypes as `xq` and `xk`
    """
    _apply_rope_input_validation(xq, freqs_cis)
    _apply_rope_input_validation(xk, freqs_cis)

    # Add a broadcastable heads axis: (..., 1, head_dim/2)
    freqs_cis = freqs_cis.unsqueeze(-2)

    # Pair consecutive features into complex numbers: (..., num_heads, head_dim/2).
    # Each tensor is reshaped with its OWN shape, and `reshape` also accepts
    # non-contiguous inputs that `view` would reject.
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))

    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)
    return xq_out.type_as(xq), xk_out.type_as(xk)
|
|
|
|
class Siglip2Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper, applied to a packed sequence.

    The input holds the tokens of several images (split into windows) in a single sequence;
    `win_meta_list` describes the windows so that attention can be restricted either to each
    window or to each image.
    """

    def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

        self.use_windows_attn = config.use_windows_attn
        self.use_rope = config.use_rope

    @staticmethod
    def _segment_lengths(win_meta_list: List[Dict], per_window: bool) -> List[int]:
        """Return the token count of every attention segment (one per window, or one per image)."""
        if per_window:
            return [win_meta['win_hw'][0] * win_meta['win_hw'][1] for win_meta in win_meta_list]

        tokens_per_image = defaultdict(int)
        for win_meta in win_meta_list:
            tokens_per_image[win_meta['img_idx']] += win_meta['win_hw'][0] * win_meta['win_hw'][1]
        return [tokens_per_image[i] for i in range(len(tokens_per_image))]

    @staticmethod
    def _block_diagonal_mask(seq_len_list: List[int], dtype: torch.dtype, device: torch.device) -> torch.Tensor:
        """Build an additive (1, 1, S, S) mask that blocks attention between different segments."""
        lengths = torch.tensor(seq_len_list, device=device)
        segment_ids = torch.repeat_interleave(torch.arange(len(seq_len_list), device=device), lengths)
        blocked = segment_ids.unsqueeze(0) != segment_ids.unsqueeze(1)
        mask = torch.zeros(blocked.shape, dtype=dtype, device=device)
        mask.masked_fill_(blocked, torch.finfo(dtype).min)
        return mask[None, None]

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
        rope_freqs_cis: Optional[torch.Tensor] = None,
        win_meta_list: Optional[List[Dict]] = None,
        windows_attn: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Input shape: Batch x Time x Channel

        Args:
            hidden_states: packed token sequence of shape (batch, seq_len, embed_dim).
            output_attentions: whether to return attention weights (not available from fused kernels).
            rope_freqs_cis: rotary factors for every token; required when `config.use_rope` is set.
            win_meta_list: per-window metadata (`img_idx`, `win_hw`) in token order. Required.
            windows_attn: restrict attention to each window (only if `config.use_windows_attn`);
                otherwise attention spans each whole image.

        Returns:
            `(attn_output, attn_weights)`; `attn_weights` is `None` unless `output_attentions` is set.

        Raises:
            ValueError: if `win_meta_list` is missing, or rope is enabled without `rope_freqs_cis`.
        """
        if win_meta_list is None:
            raise ValueError("`win_meta_list` is required to delimit the packed windows/images.")
        if self.use_rope and rope_freqs_cis is None:
            raise ValueError("`rope_freqs_cis` is required when `config.use_rope` is enabled.")

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # Rope is applied in the (batch, seq, heads, head_dim) layout, before moving heads forward.
        if self.use_rope:
            queries, keys = apply_rope(queries, keys, rope_freqs_cis)

        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)

        attn_implementation = self.config._attn_implementation
        attention_interface: Callable = eager_attention_forward
        if attn_implementation == "sdpa" and output_attentions:
            # Keep the eager implementation so the fallback announced below really happens.
            logger.warning_once(
                "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
        elif attn_implementation == "flash_attention_2":
            from transformers.modeling_utils import AttentionInterface

            # Register the packing-aware kernel once instead of on every forward pass.
            registered = AttentionInterface._global_mapping.get('flash_attention_2_packing')
            if registered is not flash_attention_forward_for_packing:
                AttentionInterface._global_mapping['flash_attention_2_packing'] = flash_attention_forward_for_packing
                setattr(AttentionInterface, 'flash_attention_2_packing', flash_attention_forward_for_packing)
            attention_interface = ALL_ATTENTION_FUNCTIONS['flash_attention_2_packing']
        elif attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[attn_implementation]

        seq_len_list = self._segment_lengths(
            win_meta_list, per_window=bool(windows_attn and self.use_windows_attn)
        )

        # The packing kernel separates segments through `seq_len_list`. Eager and sdpa ignore
        # that argument, so they get an equivalent block-diagonal mask; without it tokens of
        # different windows/images would attend to each other. The mask is only built when the
        # metadata matches the packed sequence, leaving other inputs untouched.
        attention_mask = None
        masks_supported = attention_interface is eager_attention_forward or attn_implementation == "sdpa"
        if masks_supported and batch_size == 1 and len(seq_len_list) > 1 and sum(seq_len_list) == seq_length:
            attention_mask = self._block_diagonal_mask(seq_len_list, queries.dtype, queries.device)

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
            seq_len_list=seq_len_list,
        )
        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights
|
|
|
|
class Siglip2MLP(nn.Module):
    """Two-layer feed-forward block: `fc1` -> activation -> `fc2`."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Activation selected by name from the config.
        self.activation_fn = ACT2FN[config.hidden_act]
        hidden, inner = config.hidden_size, config.intermediate_size
        self.fc1 = nn.Linear(hidden, inner)
        self.fc2 = nn.Linear(inner, hidden)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Expand to the intermediate size, apply the activation, and project back."""
        return self.fc2(self.activation_fn(self.fc1(hidden_states)))
|
|
|
|
class Siglip2EncoderLayer(nn.Module):
    """Pre-norm Transformer layer: self-attention and MLP, each wrapped in a residual connection."""

    def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = Siglip2Attention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Siglip2MLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
        rope_freqs_cis: Optional[torch.Tensor] = None,
        win_meta_list: Optional[List[Dict]] = None,
        windows_attn: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            rope_freqs_cis (`torch.Tensor`, *optional*):
                Rotary position factors, forwarded unchanged to the attention module.
            win_meta_list (`List[Dict]`, *optional*):
                Per-window metadata, forwarded unchanged to the attention module.
            windows_attn (`bool`, *optional*, defaults to `False`):
                Windowed-attention switch, forwarded unchanged to the attention module.

        Returns:
            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is set.
        """
        # Attention sub-block: normalise, attend, add back the input.
        attn_output, attn_weights = self.self_attn(
            hidden_states=self.layer_norm1(hidden_states),
            output_attentions=output_attentions,
            rope_freqs_cis=rope_freqs_cis,
            win_meta_list=win_meta_list,
            windows_attn=windows_attn,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-block with its own residual connection.
        hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
|
|
|
|
class Siglip2Encoder(nn.Module):
    """
    Stack of `config.num_hidden_layers` [`Siglip2EncoderLayer`] modules sharing one `Rope2DPosEmb`.

    Layers whose index appears in `config.full_attention_indexes` are called with `windows_attn=False`;
    every other layer is called with `windows_attn=True`.

    Args:
        config: Siglip2Config
    """

    def __init__(self, config: Siglip2Config):
        super().__init__()
        self.config = config

        head_dim = config.hidden_size // config.num_attention_heads
        self.rope_2d = Rope2DPosEmb(head_dim, 512, 512, config.window_size)
        self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False
        self.full_attention_indexes = config.full_attention_indexes

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        win_meta_list: Optional[List[Dict]] = None,
        spatial_shapes: Optional[torch.Tensor] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded input sequence fed to the first layer.
            output_attentions (`bool`, *optional*):
                Whether to collect the attention tensors of every layer. Falls back to
                `config.output_attentions` when `None`.
            output_hidden_states (`bool`, *optional*):
                Whether to collect the input of every layer plus the final output. Falls back to
                `config.output_hidden_states` when `None`.
            win_meta_list (`List[Dict]`, *optional*):
                Per-window metadata; passed to `rope_2d.get_freqs_cis` and to every layer.
            spatial_shapes (`torch.Tensor`, *optional*):
                Accepted for interface compatibility; not read by this method.

        Returns:
            [`BaseModelOutput`] carrying `last_hidden_state`, `hidden_states` (tuple or `None`) and
            `attentions` (tuple or `None`).
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # Rotary data is computed once and reused by every layer.
        rope_freqs_cis = self.rope_2d.get_freqs_cis(win_meta_list=win_meta_list, device=inputs_embeds.device)

        hidden_states = inputs_embeds
        for layer_idx, encoder_layer in enumerate(self.layers):
            # Layers listed in `full_attention_indexes` opt out of the windowed flag.
            windows_attn = layer_idx not in self.full_attention_indexes

            if output_hidden_states:
                encoder_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Positional order must match `Siglip2EncoderLayer.forward`.
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    output_attentions,
                    rope_freqs_cis,
                    win_meta_list,
                    windows_attn,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    output_attentions=output_attentions,
                    rope_freqs_cis=rope_freqs_cis,
                    win_meta_list=win_meta_list,
                    windows_attn=windows_attn,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # The output of the last layer closes the hidden-state trace.
        if output_hidden_states:
            encoder_states += (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )
|
|
|
|
def reconstruct_patch_embeddings(last_hidden_state: torch.Tensor, win_meta_list: list[dict], spatial_shapes: torch.Tensor) -> torch.Tensor:
    """Reorder dim 1 of ``last_hidden_state`` using the index map from ``build_idx_map``.

    Args:
        last_hidden_state: Tensor indexed as ``[:, idx, :]``, so it must have at least three dims.
        win_meta_list: Window metadata forwarded to ``build_idx_map``.
        spatial_shapes: Forwarded to ``build_idx_map``.

    Returns:
        ``last_hidden_state`` gathered along dim 1 in the order given by the index map.
    """
    token_order = build_idx_map(win_meta_list, spatial_shapes)
    return last_hidden_state[:, token_order, :]
|
|
# Argument documentation template that `add_start_docstrings_to_model_forward` prepends to the
# `forward` docstrings of `Siglip2VisionTransformer` and `Siglip2VisionModel`.
# NOTE(review): `interpolate_pos_encoding` is described here but is not a parameter of either of
# those `forward` signatures -- confirm whether the entry should stay.
SIGLIP2_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
|
|
|
|
class Siglip2VisionTransformer(nn.Module):
    # Pipeline: embeddings -> encoder -> post layer norm -> reorder tokens with `reverse_mapping`.
    # (Kept as comments rather than a docstring so the decorator-built `forward.__doc__` is unaffected.)

    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()
        self.config = config
        hidden_dim = config.hidden_size

        self.embeddings = Siglip2VisionEmbeddings(config)
        self.encoder = Siglip2Encoder(config)
        self.post_layernorm = nn.LayerNorm(hidden_dim, eps=config.layer_norm_eps)
        # The pooling head is on unless the config explicitly carries `vision_use_head`.
        # NOTE(review): the head is constructed here but `forward` below never calls it.
        self.use_head = getattr(config, "vision_use_head", True)
        if self.use_head:
            self.head = Siglip2MultiheadAttentionPoolingHead(config)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @can_return_tuple
    @add_start_docstrings_to_model_forward(SIGLIP2_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2VisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Returns:

        """
        # Unset flags inherit the config defaults.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states

        # The embedding module yields the encoder input together with the window metadata and the
        # index used further down to reorder the encoder output.
        windows_tensor, win_meta_list, spatial_shapes, reverse_mapping = self.embeddings(pixel_values)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=windows_tensor,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            win_meta_list=win_meta_list,
            spatial_shapes=spatial_shapes,
        )

        normed_states = self.post_layernorm(encoder_outputs.last_hidden_state)
        # Only the final hidden state is reordered; the per-layer hidden states are passed through
        # exactly as the encoder produced them.
        reordered_states = normed_states[:, reverse_mapping, :]

        # NOTE(review): the annotation says BaseModelOutputWithPooling, but a Siglip2VisionOutput
        # is what actually gets built here.
        return Siglip2VisionOutput(
            last_hidden_state=reordered_states,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            spatial_shapes=spatial_shapes,
        )
|
|
|
|
| def _trunc_normal_(tensor, mean, std, a, b): |
| |
| |
| def norm_cdf(x): |
| |
| return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 |
|
|
| if (mean < a - 2 * std) or (mean > b + 2 * std): |
| warnings.warn( |
| "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " |
| "The distribution of values may be incorrect.", |
| stacklevel=2, |
| ) |
|
|
| |
| |
| |
| l = norm_cdf((a - mean) / std) |
| u = norm_cdf((b - mean) / std) |
|
|
| |
| |
| tensor.uniform_(2 * l - 1, 2 * u - 1) |
|
|
| |
| |
| tensor.erfinv_() |
|
|
| |
| tensor.mul_(std * math.sqrt(2.0)) |
| tensor.add_(mean) |
|
|
| |
| tensor.clamp_(min=a, max=b) |
|
|
|
|
| def trunc_normal_tf_( |
| tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0 |
| ) -> torch.Tensor: |
| """Fills the input Tensor with values drawn from a truncated |
| normal distribution. The values are effectively drawn from the |
| normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)` |
| with values outside :math:`[a, b]` redrawn until they are within |
| the bounds. The method used for generating the random values works |
| best when :math:`a \\leq \text{mean} \\leq b`. |
| |
| NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the |
| bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 |
| and the result is subsequently scaled and shifted by the mean and std args. |
| |
| Args: |
| tensor: an n-dimensional `torch.Tensor` |
| mean: the mean of the normal distribution |
| std: the standard deviation of the normal distribution |
| a: the minimum cutoff value |
| b: the maximum cutoff value |
| """ |
| with torch.no_grad(): |
| _trunc_normal_(tensor, 0, 1.0, a, b) |
| tensor.mul_(std).add_(mean) |
|
|
|
|
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    """Initialize ``tensor`` in place with variance ``scale / denom``.

    ``denom`` is the fan-in, the fan-out, or their average, depending on ``mode``.

    Args:
        tensor: Tensor to initialize; needs at least two dimensions so fans can be computed.
        scale: Numerator of the target variance.
        mode: One of ``"fan_in"``, ``"fan_out"`` or ``"fan_avg"``.
        distribution: One of ``"truncated_normal"``, ``"normal"`` or ``"uniform"``.

    Raises:
        ValueError: If ``mode`` or ``distribution`` is not one of the supported names.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2
    else:
        # Without this branch an unknown mode left `denom` unbound and surfaced as an
        # UnboundLocalError on the next line.
        raise ValueError(f"invalid mode {mode}")

    variance = scale / denom

    if distribution == "truncated_normal":
        # The constant is the stddev of a standard normal truncated to (-2, 2); dividing by it
        # makes the truncated samples end up with the requested variance.
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        # A uniform distribution on [-bound, bound] has variance bound**2 / 3.
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")
|
|
|
|
def lecun_normal_(tensor):
    """Initialize ``tensor`` in place via ``variance_scaling_`` with fan-in scaling and a truncated normal."""
    variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="truncated_normal")
|
|
|
|
def default_flax_embed_init(tensor):
    """Initialize ``tensor`` in place via ``variance_scaling_`` with fan-in scaling and a plain normal."""
    variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal")
|
|
|
|
# Class-level documentation template; passed to `add_start_docstrings` on `Siglip2VisionModel`.
SIGLIP2_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Siglip2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
|
|
# Argument documentation template covering both text and image inputs.
# NOTE(review): nothing in the visible part of this file references this constant -- confirm it is
# still needed before relying on it.
SIGLIP2_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
|
|
|
|
class Siglip2PreTrainedModel(PreTrainedModel):
    """
    Shared base for Siglip2 models: declares the `PreTrainedModel` class-level settings and provides the
    per-module weight initialization used by `post_init()`.
    """

    config_class = Siglip2Config
    base_model_prefix = "siglip2"
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "Siglip2TextEmbeddings",
        "Siglip2EncoderLayer",
        "Siglip2VisionEmbeddings",
        "Siglip2EncoderLayer",
        "Siglip2MultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights of a single ``module``.

        The type checks run in a fixed order and the first match wins, so the model-specific modules
        are handled before the generic ``nn.Linear`` / ``nn.Conv2d`` / ``nn.LayerNorm`` fallbacks.

        NOTE(review): ``Siglip2Model`` and ``Siglip2ForImageClassification`` are not defined in the
        visible part of this file -- confirm they are in scope, otherwise any module that reaches
        those checks raises ``NameError``.
        """
        if isinstance(module, Siglip2VisionEmbeddings):
            # A composite config keeps the vision width under `vision_config`.
            if isinstance(self.config, Siglip2Config):
                width = self.config.vision_config.hidden_size
            else:
                width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
            return

        if isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
            return

        if isinstance(module, Siglip2Attention):
            projections = (module.q_proj, module.k_proj, module.v_proj, module.out_proj)
            # All weights first, then all biases, in q/k/v/out order.
            for proj in projections:
                nn.init.xavier_uniform_(proj.weight)
            for proj in projections:
                nn.init.zeros_(proj.bias)
            return

        if isinstance(module, Siglip2MLP):
            dense_layers = (module.fc1, module.fc2)
            # Both weights are drawn before either bias so the random stream is consumed in a fixed order.
            for dense in dense_layers:
                nn.init.xavier_uniform_(dense.weight)
            for dense in dense_layers:
                nn.init.normal_(dense.bias, std=1e-6)
            return

        if isinstance(module, Siglip2MultiheadAttentionPoolingHead):
            nn.init.xavier_uniform_(module.probe.data)
            nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
            nn.init.zeros_(module.attention.in_proj_bias.data)
            return

        if isinstance(module, Siglip2Model):
            logit_scale_init = torch.log(torch.tensor(1.0))
            module.logit_scale.data.fill_(logit_scale_init)
            module.logit_bias.data.zero_()
            return

        if isinstance(module, Siglip2ForImageClassification):
            classifier_std = self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor
            nn.init.normal_(module.classifier.weight, std=classifier_std)
            return

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
|
|
|
|
class Siglip2MultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling.

    A learned probe of shape ``(1, 1, hidden_size)`` is used as the query of a multi-head attention over
    the input sequence; the attended result goes through a layer-norm + MLP residual block.
    """

    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()

        width = config.hidden_size
        heads = config.num_attention_heads
        self.probe = nn.Parameter(torch.randn(1, 1, width))
        self.attention = torch.nn.MultiheadAttention(width, heads, batch_first=True)
        self.layernorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.mlp = Siglip2MLP(config)
        self.num_heads = heads

    def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Pool ``hidden_state`` into one vector per batch element.

        Args:
            hidden_state: Sequence to pool; its first dim is the batch size and its second dim is the
                source length seen by the attention.
            attention_mask: Optional mask; it is converted with ``_prepare_4d_attention_mask``, repeated
                ``num_heads`` times along dim 1 and reshaped to ``(-1, target_len, source_len)`` before
                being given to the attention as ``attn_mask``.

        Returns:
            The slice at position 0 along dim 1 of the pooled output.
        """
        # One copy of the probe per batch element acts as the query.
        queries = self.probe.repeat(hidden_state.shape[0], 1, 1)

        if attention_mask is not None:
            target_len = queries.shape[1]
            source_len = hidden_state.shape[1]
            expanded_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
            per_head_mask = expanded_mask.repeat(1, self.num_heads, target_len, 1)
            attention_mask = per_head_mask.reshape(-1, target_len, source_len)

        pooled, _ = self.attention(queries, hidden_state, hidden_state, attn_mask=attention_mask)

        # Residual MLP block on top of the attended probe.
        pooled = pooled + self.mlp(self.layernorm(pooled))

        return pooled[:, 0]
|
|
|
|
@add_start_docstrings(
    """The vision model from Siglip2 without any head or projection on top.""",
    SIGLIP2_START_DOCSTRING,
)
class Siglip2VisionModel(Siglip2PreTrainedModel):
    config_class = Siglip2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)

        # All computation lives in the wrapped transformer; this class only adds the
        # `PreTrainedModel` plumbing around it.
        self.vision_model = Siglip2VisionTransformer(config)

        # Run weight initialization and the remaining post-construction hooks.
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # The patch embedding of the wrapped transformer is this model's input embedding.
        embeddings = self.vision_model.embeddings
        return embeddings.patch_embedding

    @can_return_tuple
    @add_start_docstrings_to_model_forward(SIGLIP2_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2VisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Returns:

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        # Pure delegation: every argument is forwarded by keyword to the wrapped transformer.
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        return vision_outputs
|
|
|
|
# Public API of this module: only the vision model is exported by `from ... import *`.
__all__ = [
    "Siglip2VisionModel",
]