transformers/examples/modular-transformers/modeling_from_uppercase_model.py

Upload folder using huggingface_hub

a9bd396 verified about 1 month ago

6.27 kB

	# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
	# This file was automatically generated from examples/modular-transformers/modular_from_uppercase_model.py.
	# Do NOT edit this file manually as any edits will be overwritten by the generation of
	# the file from the modular. If any change should be done, please apply the change to the
	# modular_from_uppercase_model.py file directly. One of our CI enforces this.
	# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨

	from collections.abc import Callable

	import torch
	from torch import nn

	from ...activations import ACT2FN
	from ...modeling_layers import GradientCheckpointingLayer
	from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
	from ...processing_utils import Unpack
	from ...utils import TransformersKwargs
	from .configuration_from_uppercase_model import FromUppercaseModelTextConfig, FromUppercaseModelVisionConfig


	def eager_attention_forward(
	    module: nn.Module,
	    query: torch.Tensor,
	    key: torch.Tensor,
	    value: torch.Tensor,
	    attention_mask: torch.Tensor | None,
	    scaling: float,
	    dropout: float = 0.0,
	    **kwargs: Unpack[TransformersKwargs],
	):
	    """Compute scaled dot-product attention with plain matmul/softmax ops.

	    Args:
	        module: Module on whose behalf attention runs; only its `training`
	            flag is read, to decide whether dropout is active.
	        query: Query tensor. The attention class in this file passes
	            `(batch, num_heads, seq_len, head_dim)`.
	        key: Key tensor; its last two dimensions are swapped before the
	            product with `query`.
	        value: Value tensor, multiplied by the attention probabilities.
	        attention_mask: Optional additive mask, summed onto the scaled
	            scores before the softmax. `None` skips masking.
	        scaling: Factor applied to the raw `query @ key^T` scores.
	        dropout: Dropout probability applied to the attention probabilities.
	        **kwargs: Accepted for interface compatibility; not used here.

	    Returns:
	        A tuple `(attn_output, attn_weights)`. `attn_output` is the weighted
	        sum of `value` with dimensions 1 and 2 swapped and made contiguous;
	        `attn_weights` are the (post-dropout) probabilities in `query.dtype`.
	    """
	    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
	    if attention_mask is not None:
	        attn_weights = attn_weights + attention_mask

	    # Softmax is evaluated in float32 and cast back to the query dtype.
	    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
	    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

	    attn_output = torch.matmul(attn_weights, value)
	    attn_output = attn_output.transpose(1, 2).contiguous()
	    return attn_output, attn_weights


	class FromUppercaseModelAttention(nn.Module):
	    """Multi-headed attention from 'Attention Is All You Need' paper.

	    Projects the input into queries, keys and values, splits them into
	    `num_attention_heads` heads of size `hidden_size // num_attention_heads`,
	    runs the configured attention implementation and applies an output
	    projection.
	    """

	    def __init__(self, config: FromUppercaseModelVisionConfig | FromUppercaseModelTextConfig):
	        """Build the projections from `config`.

	        Reads `hidden_size`, `num_attention_heads` and `attention_dropout`.

	        Raises:
	            ValueError: If `hidden_size` is not divisible by
	                `num_attention_heads`.
	        """
	        super().__init__()
	        self.config = config
	        self.embed_dim = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.head_dim = self.embed_dim // self.num_heads
	        if self.head_dim * self.num_heads != self.embed_dim:
	            raise ValueError(
	                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
	                f" {self.num_heads})."
	            )
	        self.scale = self.head_dim**-0.5
	        self.dropout = config.attention_dropout
	        self.is_causal = False

	        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor | None = None,
	        **kwargs: Unpack[TransformersKwargs],
	    ) -> tuple[torch.Tensor, torch.Tensor | None]:
	        """Input shape: Batch x Time x Channel

	        Args:
	            hidden_states: Input of shape `(batch, seq_len, embed_dim)`.
	            attention_mask: Optional mask handed unchanged to the attention
	                implementation.
	            **kwargs: Extra keyword arguments forwarded to the attention
	                implementation.

	        Returns:
	            `(attn_output, attn_weights)` where `attn_output` has shape
	            `(batch, seq_len, embed_dim)` after the output projection and
	            `attn_weights` is whatever the selected attention implementation
	            returns as its second value.
	        """
	        # The third dimension is unpacked only to enforce a 3-D input.
	        batch_size, seq_length, _ = hidden_states.shape

	        queries = self.q_proj(hidden_states)
	        keys = self.k_proj(hidden_states)
	        values = self.v_proj(hidden_states)

	        # (batch, seq, embed) -> (batch, heads, seq, head_dim)
	        queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
	        keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
	        values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)

	        # Use the eager implementation unless the config selects another one.
	        attention_interface: Callable = eager_attention_forward
	        if self.config._attn_implementation != "eager":
	            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

	        attn_output, attn_weights = attention_interface(
	            self,
	            queries,
	            keys,
	            values,
	            attention_mask,
	            scaling=self.scale,
	            dropout=0.0 if not self.training else self.dropout,
	            **kwargs,
	        )

	        # Merge the heads back into a single channel dimension.
	        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
	        attn_output = self.out_proj(attn_output)

	        return attn_output, attn_weights


	class FromUppercaseModelMLP(nn.Module):
	    """Two-layer feed-forward block: linear, activation, linear."""

	    def __init__(self, config):
	        """Build the layers from `config`.

	        Reads `hidden_act` (looked up in `ACT2FN`), `hidden_size` and
	        `intermediate_size`.
	        """
	        super().__init__()
	        self.config = config
	        self.activation_fn = ACT2FN[config.hidden_act]
	        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
	        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Apply `fc1`, the activation, then `fc2` to `hidden_states`."""
	        hidden_states = self.fc1(hidden_states)
	        hidden_states = self.activation_fn(hidden_states)
	        hidden_states = self.fc2(hidden_states)
	        return hidden_states


	class FromUppercaseModelEncoderLayer(GradientCheckpointingLayer):
	    """Pre-norm encoder layer: self-attention and MLP, each with a residual."""

	    def __init__(self, config: FromUppercaseModelVisionConfig | FromUppercaseModelTextConfig):
	        """Build the sub-modules from `config`.

	        Reads `hidden_size` and `layer_norm_eps` directly; the attention and
	        MLP sub-modules read their own fields from the same `config`.
	        """
	        super().__init__()
	        self.embed_dim = config.hidden_size
	        self.self_attn = FromUppercaseModelAttention(config)
	        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
	        self.mlp = FromUppercaseModelMLP(config)
	        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor | None,
	        **kwargs: Unpack[TransformersKwargs],
	    ) -> torch.FloatTensor:
	        """Run one encoder layer.

	        Args:
	            hidden_states: Layer input.
	            attention_mask: Mask forwarded unchanged to the self-attention
	                module, which also accepts `None`.
	            **kwargs: Extra keyword arguments forwarded to the self-attention
	                module.

	        Returns:
	            The hidden states after both residual sub-blocks.
	        """
	        # Self-attention sub-block: normalize, attend, add the residual.
	        residual = hidden_states

	        hidden_states = self.layer_norm1(hidden_states)
	        # The attention weights are discarded here.
	        hidden_states, _ = self.self_attn(
	            hidden_states=hidden_states,
	            attention_mask=attention_mask,
	            **kwargs,
	        )
	        hidden_states = residual + hidden_states

	        # Feed-forward sub-block: normalize, MLP, add the residual.
	        residual = hidden_states
	        hidden_states = self.layer_norm2(hidden_states)
	        hidden_states = self.mlp(hidden_states)
	        hidden_states = residual + hidden_states

	        return hidden_states