| """PyTorch Aria vision transformer.""" |
|
|
| from typing import Optional, Tuple, Union |
|
|
| import torch |
| import torch.utils.checkpoint |
| from transformers import SiglipVisionConfig, SiglipVisionModel |
| from transformers.modeling_outputs import BaseModelOutputWithPooling |
| from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer |
|
|
|
|
| class AriaVisionConfig(SiglipVisionConfig): |
| """Configuration class for AriaVisionModel.""" |
|
|
| model_type = "aria_vision_model" |
|
|
| def __init__( |
| self, |
| **kwargs, |
| ): |
| super().__init__(**kwargs) |


class IdentityOp(torch.nn.Module):
    """
    An identity operation that returns its input unchanged.

    This can be used as a placeholder, or to maintain architectural consistency
    when a specific operation is not needed.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x


class AriaVisionTransformer(Idefics2VisionTransformer):
    """
    Aria vision transformer based on Idefics2VisionTransformer.

    This class extends Idefics2VisionTransformer by replacing the
    post-layernorm with an identity operation, effectively removing it.
    """

    def __init__(self, config: AriaVisionConfig):
        super().__init__(config)
        self.post_layernorm = IdentityOp()


class AriaVisionModel(SiglipVisionModel):
    """
    Aria Vision Model, extending SiglipVisionModel to support a pixel_mask.

    The pixel_mask is a 2D boolean tensor that indicates which pixels in the
    input image are actual content and which are padding. It has the same
    height and width as the input image, where:

    - True (1) values mark pixels that belong to the original image
    - False (0) values mark padding pixels

    This mask helps the model focus on the relevant parts of the image during
    processing.
    """

    config_class = AriaVisionConfig
    main_input_name = "pixel_values"
    _supports_sdpa = False

    def __init__(self, config: AriaVisionConfig):
        super().__init__(config)
        # Swap in the Aria vision tower, which drops the post-layernorm.
        self.vision_model = AriaVisionTransformer(config)

        # Initialize weights and apply final processing.
        self.post_init()

    def forward(
        self,
        pixel_values: torch.Tensor,
        pixel_mask: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Tuple[Union[Tuple, BaseModelOutputWithPooling], Optional[torch.Tensor]]:
        """
        Forward pass of the AriaVisionModel.

        Args:
            pixel_values (torch.Tensor): The pixel values of the input images.
            pixel_mask (Optional[torch.BoolTensor]): 2D mask over the input
                pixels; True marks image content and False marks padding.
            output_attentions (Optional[bool]): Whether to output attentions.
            output_hidden_states (Optional[bool]): Whether to output hidden states.
            return_dict (Optional[bool]): Whether to return a ModelOutput object.

        Returns:
            A two-element tuple of the vision transformer output (a tuple or a
            BaseModelOutputWithPooling) and the flattened image attention mask,
            in which True marks padding patches (None when pixel_mask is None).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        patch_attention_mask = self._create_patch_attention_mask(pixel_mask)

        vit_oup = self.vision_model(
            pixel_values=pixel_values,
            patch_attention_mask=patch_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_atts = self._create_image_attention_mask(patch_attention_mask)

        return vit_oup, image_atts

    def _create_patch_attention_mask(self, pixel_mask):
        if pixel_mask is None:
            return None

        # Tile the pixel mask into non-overlapping patch_size x patch_size
        # blocks, one per ViT patch; e.g. a (batch, 56, 56) mask with
        # patch_size=14 unfolds to (batch, 4, 4, 14, 14).
        patches_subgrid = pixel_mask.unfold(
            dimension=1,
            size=self.vision_model.config.patch_size,
            step=self.vision_model.config.patch_size,
        ).unfold(
            dimension=2,
            size=self.vision_model.config.patch_size,
            step=self.vision_model.config.patch_size,
        )
        # A patch is kept if it covers at least one real (non-padding) pixel.
        return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

    def _create_image_attention_mask(self, patch_attention_mask):
        if patch_attention_mask is None:
            return None

        # Flatten the patch grid to (batch, num_patches) and invert it, so
        # True marks padding patches for downstream consumers to ignore.
        flattened_mask = patch_attention_mask.flatten(1)
        return torch.logical_not(flattened_mask)
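

# A minimal usage sketch (illustrative only, not part of the model). It uses a
# small hypothetical configuration and random inputs; real weights would be
# loaded with AriaVisionModel.from_pretrained instead.
if __name__ == "__main__":
    config = AriaVisionConfig(
        hidden_size=64,
        intermediate_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        image_size=56,
        patch_size=14,
    )
    model = AriaVisionModel(config).eval()

    # One RGB image in which only the top half holds real content.
    pixel_values = torch.randn(1, 3, 56, 56)
    pixel_mask = torch.zeros(1, 56, 56, dtype=torch.bool)
    pixel_mask[:, :28, :] = True

    with torch.no_grad():
        vit_oup, image_atts = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

    # (56 / 14) ** 2 = 16 patches; the bottom two patch rows are pure padding.
    print(vit_oup.last_hidden_state.shape)  # torch.Size([1, 16, 64])
    print(image_atts)  # True where a patch contains only padding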