import math
from dataclasses import dataclass
from typing import Any, Callable, Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig


@dataclass
@auto_docstring
class Aimv2Output(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: Optional[BaseModelOutputWithPooling] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> tuple[Any, ...]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
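

# Note on `to_tuple`: it recurses into the two nested model outputs, so tuple
# consumers receive plain tuples for `text_model_output` / `vision_model_output`
# instead of `ModelOutput` instances.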


@use_kernel_forward_from_hub("RMSNorm")
class Aimv2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Aimv2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
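

# A minimal numeric sketch of what Aimv2RMSNorm computes (illustrative only, not
# executed at import time): each token vector is scaled by the reciprocal
# root-mean-square of its own components, then weighted elementwise.
#
#   >>> x = torch.tensor([[3.0, 4.0]])    # mean(x**2) = (9 + 16) / 2 = 12.5
#   >>> Aimv2RMSNorm(hidden_size=2)(x)    # x * rsqrt(12.5), unit weight
#   tensor([[0.8485, 1.1314]])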


class Aimv2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj
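

# This is a gated feed-forward block, down_proj(act(gate_proj(x)) * up_proj(x)),
# rather than a classic two-layer MLP; with `hidden_act="silu"` it matches the
# SwiGLU formulation used by LLaMA-style models.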


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
            self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy")

        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / (temperature**omega)

        out_h = grid_h.flatten()[..., None] @ omega[None, :]
        out_w = grid_w.flatten()[..., None] @ omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states
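

# Shape sketch for the embeddings above (illustrative values, assuming a 224x224
# input with patch_size=14): the Conv2d yields (224 // 14) ** 2 = 256 patch
# tokens, so `forward` returns a (batch, 256, hidden_size) tensor. In "native"
# mode the sin/cos position embedding is rebuilt on the fly from the actual
# patch grid, so variable and non-square resolutions work; otherwise a learned
# embedding for the fixed `num_patches` grid is added.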


class Aimv2TextEmbeddings(nn.Module):
    def __init__(self, config: Aimv2TextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding})"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings
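

# Token and learned absolute position embeddings are summed; `position_ids`
# defaults to [0, 1, ..., seq_length - 1] via the registered buffer, and inputs
# longer than `max_position_embeddings` are rejected up front.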


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
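

# eager_attention_forward is plain scaled dot-product attention,
#   softmax(Q @ K^T * scaling + mask) @ V,
# with the softmax computed in float32 for numerical stability. The output is
# transposed back to (batch, seq, heads, head_dim) so callers can flatten the
# head dimension.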


class Aimv2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
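

# Backend note: the module dispatches through ALL_ATTENTION_FUNCTIONS, so the
# same weights run under SDPA, FlashAttention, or flex attention depending on
# `config._attn_implementation`; `eager_attention_forward` above is the fallback.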


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, _ = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask, **kwargs)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return hidden_states
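

# The layer is pre-norm: RMSNorm -> attention -> residual add, then RMSNorm ->
# gated MLP -> residual add, so normalization is applied to branch inputs and
# never to the residual stream itself.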


class Aimv2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is an
    [`Aimv2EncoderLayer`].

    Args:
        config: Aimv2Config
    """

    def __init__(self, config: Aimv2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Aimv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @auto_docstring
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask,
                **kwargs,
            )

        return BaseModelOutput(last_hidden_state=hidden_states)


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)

        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output
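

# The pooling head is single-query cross-attention: one learned `cls_token`
# query attends over all patch tokens, producing a (batch, 1, hidden) tensor;
# `mean(dim=1)` then just squeezes the singleton query dimension before the
# output projection.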


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)
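

# Initialization note: `logit_scale` is filled with log(1 / 0.07) ≈ 2.659, the
# CLIP-style temperature initialization, and the pooling head's `cls_token` is
# drawn from N(0, initializer_range**2).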


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"
    _can_record_outputs = {
        "hidden_states": Aimv2EncoderLayer,
        "attentions": Aimv2Attention,
    }

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)

        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @deprecate_kwarg("attention_mask", version="v4.58.0")
    @check_model_inputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        pixel_values,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        hidden_states = self.embeddings(pixel_values)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"

    _can_record_outputs = {
        "hidden_states": Aimv2EncoderLayer,
        "attentions": Aimv2Attention,
    }

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @check_model_inputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)
        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)

        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id).int().argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )
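

# Pooling note: `argmax` over `(input_ids == eos_token_id)` returns the index of
# the first EOS token in each sequence, so the pooled text embedding is the
# final-layer hidden state at that position, CLIP-style.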


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and is used to make
    the model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor


@auto_docstring
class Aimv2Model(Aimv2PreTrainedModel):
    config: Aimv2Config
    _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"]
    _supports_flash_attn = True

    def __init__(self, config: Aimv2Config):
        super().__init__(config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        self.post_init()

    @filter_out_non_signature_kwargs()
    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2TextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> tokenizer = AutoTokenizer.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    @filter_out_non_signature_kwargs()
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Aimv2VisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Aimv2Model
        >>> from transformers.image_utils import load_image

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            **kwargs,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
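

# Temperature note: `logit_scale` is learned in log space and clamped to
# [0, max_log_logit_scale] before `exp()`, so the multiplier applied to the
# cosine similarities always lies in [1, config.max_logit_scale].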


__all__ = ["Aimv2VisionModel", "Aimv2Model", "Aimv2PreTrainedModel", "Aimv2TextModel"]