yubo0306 committed (verified)
Commit 425850c · 1 Parent(s): 3ca7108

Upload AVHubertForConditionalGeneration

README.md ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->



## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]
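As a placeholder until the snippet above is filled in, here is a minimal, untested sketch of how a repository that ships custom code files (as this one does) is typically loaded; the repository id is hypothetical and the `auto_map` wiring in `config.json` is assumed:

```python
from transformers import AutoConfig, AutoModel

repo_id = "<user>/<repo>"  # hypothetical - replace with this repository's Hub id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
```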
## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]


#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary



## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
config.json ADDED
The diff for this file is too large to render. See raw diff
 
configuration_avhubert.py ADDED
@@ -0,0 +1,151 @@
from transformers import HubertConfig, PretrainedConfig


class AVHubertConfig(PretrainedConfig):
    model_type: str = "avhubert"

    def __init__(
        self,
        label_rate: int = 100,
        encoder_layers: int = 12,
        encoder_embed_dim: int = 768,
        encoder_ffn_embed_dim: int = 3072,
        encoder_attention_heads: int = 12,
        activation_fn: str = "gelu",
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.0,
        encoder_layerdrop: float = 0.0,
        dropout_input: float = 0.0,
        conv_dim: tuple[int, ...] = (512, 512, 512, 512, 512, 512, 512),
        conv_stride: tuple[int, ...] = (5, 2, 2, 2, 2, 2, 2),
        conv_kernel: tuple[int, ...] = (10, 3, 3, 3, 3, 2, 2),
        conv_bias: bool = False,
        conv_pos: int = 128,
        conv_pos_groups: int = 16,
        resnet_relu_type: str = "prelu",
        audio_feat_dim: int = 104,
        modality_fuse: str = "concat",
        decoder_embed_dim: int = 768,
        decoder_ffn_embed_dim: int = 3072,
        decoder_layers: int = 6,
        decoder_layerdrop: float = 0.0,
        decoder_attention_heads: int = 4,
        decoder_learned_pos: bool = False,
        decoder_normalize_before: bool = False,
        no_token_positional_embeddings: bool = False,
        decoder_dropout: float = 0.1,
        decoder_attention_dropout: float = 0.1,
        decoder_activation_dropout: float = 0.0,
        max_target_positions: int = 2048,
        share_decoder_input_output_embed: bool = False,
        no_scale_embedding: bool = True,
        sample_rate: int = 25,
        num_labels: int = 100,
        initializer_range: float = 0.02,
        do_stable_layer_norm: bool = False,
        vocab_size: int | None = None,
        freeze_feature_encoder: bool = False,
        freeze_base_model: bool = False,
        ctc_loss_reduction: str = "mean",
        ctc_zero_infinity: bool = False,
        ctc_loss_weight: float = 0.3,
        special_ids: list[int] | None = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.label_rate = label_rate
        self.encoder_layers = encoder_layers
        self.encoder_embed_dim = encoder_embed_dim
        self.encoder_ffn_embed_dim = encoder_ffn_embed_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.activation_fn = activation_fn
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.dropout_input = dropout_input
        self.conv_dim = conv_dim
        self.conv_kernel = conv_kernel
        self.conv_stride = conv_stride
        self.conv_bias = conv_bias
        self.conv_pos = conv_pos
        self.conv_pos_groups = conv_pos_groups
        self.resnet_relu_type = resnet_relu_type
        self.audio_feat_dim = audio_feat_dim
        self.modality_fuse = modality_fuse
        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_ffn_embed_dim = decoder_ffn_embed_dim
        self.decoder_layers = decoder_layers
        self.decoder_layerdrop = decoder_layerdrop
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_learned_pos = decoder_learned_pos
        self.decoder_normalize_before = decoder_normalize_before
        self.no_token_positional_embeddings = no_token_positional_embeddings
        self.decoder_dropout = decoder_dropout
        self.decoder_attention_dropout = decoder_attention_dropout
        self.decoder_activation_dropout = decoder_activation_dropout
        self.max_target_positions = max_target_positions
        self.share_decoder_input_output_embed = share_decoder_input_output_embed
        self.no_scale_embedding = no_scale_embedding
        self.sample_rate = sample_rate
        self.num_labels = num_labels
        self.initializer_range = initializer_range
        self.do_stable_layer_norm = do_stable_layer_norm
        self.vocab_size = vocab_size
        self.freeze_feature_encoder = freeze_feature_encoder
        self.freeze_base_model = freeze_base_model
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity
        self.ctc_loss_weight = ctc_loss_weight
        self.special_ids = special_ids

    @property
    def encoder_config(self) -> HubertConfig:
        return HubertConfig(
            hidden_size=self.encoder_embed_dim,
            num_hidden_layers=self.encoder_layers,
            num_attention_heads=self.encoder_attention_heads,
            intermediate_size=self.encoder_ffn_embed_dim,
            hidden_act=self.activation_fn,
            hidden_dropout=self.dropout,
            activation_dropout=self.activation_dropout,
            attention_dropout=self.attention_dropout,
            layerdrop=self.encoder_layerdrop,
            conv_dim=self.conv_dim,
            conv_kernel=self.conv_kernel,
            conv_stride=self.conv_stride,
            conv_bias=self.conv_bias,
            num_conv_pos_embeddings=self.conv_pos,
            num_conv_pos_embedding_groups=self.conv_pos_groups,
            feat_extract_activation="gelu",
            do_stable_layer_norm=self.do_stable_layer_norm,
            max_position_embeddings=self.max_target_positions,
            learned_pos=self.decoder_learned_pos,
            share_input_output_embed=self.share_decoder_input_output_embed,
        )

    @property
    def decoder_config(self) -> HubertConfig:
        return HubertConfig(
            hidden_size=self.decoder_embed_dim,
            num_hidden_layers=self.decoder_layers,
            num_attention_heads=self.decoder_attention_heads,
            intermediate_size=self.decoder_ffn_embed_dim,
            hidden_act=self.activation_fn,
            hidden_dropout=self.decoder_dropout,
            activation_dropout=self.decoder_activation_dropout,
            attention_dropout=self.decoder_attention_dropout,
            layerdrop=self.decoder_layerdrop,
            conv_dim=self.conv_dim,
            conv_kernel=self.conv_kernel,
            conv_stride=self.conv_stride,
            conv_bias=self.conv_bias,
            num_conv_pos_embeddings=self.conv_pos,
            num_conv_pos_embedding_groups=self.conv_pos_groups,
            feat_extract_activation="gelu",
            do_stable_layer_norm=self.do_stable_layer_norm,
            max_position_embeddings=self.max_target_positions,
            learned_pos=self.decoder_learned_pos,
            share_input_output_embed=self.share_decoder_input_output_embed,
        )
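A quick usage sketch for this configuration (not part of the upload; it assumes the file above is importable from the working directory). The `encoder_config` and `decoder_config` properties expose HubertConfig views that the modeling code can build its encoder and decoder stacks from:

from configuration_avhubert import AVHubertConfig  # the file above, assumed saved locally

config = AVHubertConfig()            # the default hyper-parameters listed above
enc_cfg = config.encoder_config      # HubertConfig view used for the encoder stack
dec_cfg = config.decoder_config      # HubertConfig view used for the decoder stack
print(enc_cfg.hidden_size, enc_cfg.num_hidden_layers)  # 768 12
print(dec_cfg.hidden_size, dec_cfg.num_hidden_layers)  # 768 6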
configuration_resnet.py ADDED
@@ -0,0 +1,17 @@
from transformers import PretrainedConfig


class ResEncoderConfig(PretrainedConfig):
    model_type = "modified_resnet"

    def __init__(
        self,
        relu_type="prelu",
        frontend_nout=64,
        backend_out=512,
        **kwargs,
    ):
        self.relu_type = relu_type
        self.frontend_nout = frontend_nout
        self.backend_out = backend_out
        super().__init__(**kwargs)
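Similarly, a tiny instantiation sketch for the visual front-end configuration (again not part of the upload; the file is assumed importable locally and the field meanings are inferred from their names):

from configuration_resnet import ResEncoderConfig

res_cfg = ResEncoderConfig()                   # defaults: prelu activation, frontend_nout=64, backend_out=512
print(res_cfg.relu_type, res_cfg.backend_out)  # prelu 512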
decoder.py ADDED
@@ -0,0 +1,1097 @@
from typing import Callable, Optional, Tuple, TypedDict, Union

import numpy as np
import torch
import torch.nn as nn
from transformers.cache_utils import Cache, EncoderDecoderCache, StaticCache
from transformers.modeling_attn_mask_utils import (
    AttentionMaskConverter,
    _prepare_4d_attention_mask,
    _prepare_4d_attention_mask_for_sdpa,
)
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from transformers.models.hubert.configuration_hubert import HubertConfig
from transformers.models.hubert.modeling_hubert import (
    HubertAttnAdapterLayer,
    HubertFeedForward,
    is_deepspeed_zero3_enabled,
)
from transformers.utils import is_torchdynamo_compiling, logging
from typing_extensions import Unpack

logger = logging.get_logger(__name__)


# Copied from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_flash_attention_utils.py#L428
class FlashAttentionKwargs(TypedDict, total=False):
    """
    Keyword arguments for Flash Attention with Compile.

    Attributes:
        cumulative_seqlens_q (`torch.LongTensor`, *optional*)
            Gets cumulative sequence length for query state.
        cumulative_seqlens_k (`torch.LongTensor`, *optional*)
            Gets cumulative sequence length for key state.
        max_length_q (`int`, *optional*):
            Maximum sequence length for query state.
        max_length_k (`int`, *optional*):
            Maximum sequence length for key state.
    """

    cumulative_seqlens_q: Optional[torch.LongTensor]
    cumulative_seqlens_k: Optional[torch.LongTensor]
    max_length_q: Optional[int]
    max_length_k: Optional[int]


class SinusoidalPositionalEmbedding(nn.Module):
    def __init__(self, config) -> None:
        super().__init__()
        weight = torch.empty(
            (
                config.max_position_embeddings,
                config.hidden_size,
            ),
            requires_grad=False,
        )
        self._init_sinusoidal_embedding(weight)
        self.register_buffer("position_embeddings", weight)

    def _init_sinusoidal_embedding(self, embeddings: torch.Tensor) -> None:
        T, D = embeddings.size()
        position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / D) for j in range(D)] for pos in range(T)])
        embeddings[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
        embeddings[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))

    def forward(
        self,
        inputs: torch.Tensor,
        past_key_values_length: int = 0,  # Offset
        position_ids: torch.LongTensor | None = None,
    ) -> torch.Tensor:
        if position_ids is None:
            bsz, seq_len = inputs.shape[:2]
            position_ids = torch.arange(
                past_key_values_length,
                past_key_values_length + seq_len,
                dtype=torch.long,
                device=self.position_embeddings.device,
            ).expand(bsz, -1)
        else:
            position_ids = position_ids.unsqueeze(0)
        return self.position_embeddings[position_ids]
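An illustrative aside (not part of decoder.py): the table built by `_init_sinusoidal_embedding` follows the usual sinusoidal recipe, with sines in the even feature slots and cosines in the odd ones. A toy check, using a stand-in config object:

from types import SimpleNamespace
import torch
from decoder import SinusoidalPositionalEmbedding  # the class defined just above, assuming the file is saved as decoder.py

toy_cfg = SimpleNamespace(max_position_embeddings=4, hidden_size=8)  # toy sizes, for illustration only
pos = SinusoidalPositionalEmbedding(toy_cfg)
x = torch.zeros(1, 3, 8)      # (batch, seq, hidden); only the shape is read
emb = pos(x)                  # looks up positions 0..2
print(emb.shape)              # torch.Size([1, 3, 8])
print(emb[0, 0])              # position 0: sine slots are 0.0, cosine slots are 1.0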
# Copied from https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/models/bart/modeling_bart.py#L116
class LearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models don't have this hack
        # self.offset = 2
        # super().__init__(num_embeddings + self.offset, embedding_dim)
        super().__init__(num_embeddings, embedding_dim)

    def forward(
        self,
        input_ids: torch.Tensor,
        past_key_values_length: int = 0,
        position_ids: torch.LongTensor = None,
    ):
        """`input_ids' shape is expected to be [bsz x seqlen]."""

        if position_ids is None:
            bsz, seq_len = input_ids.shape[:2]
            position_ids = torch.arange(
                past_key_values_length,
                past_key_values_length + seq_len,
                dtype=torch.long,
                device=self.weight.device,
            ).expand(bsz, -1)
        else:
            position_ids = position_ids.unsqueeze(0)

        # return super().forward(positions + self.offset)
        return super().forward(position_ids)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    if head_mask is not None:
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
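An illustrative aside (not part of decoder.py): `eager_attention_forward` is plain scaled dot-product attention, so with no mask, no head mask and zero dropout it should match PyTorch's fused kernel up to the output layout. A small sanity check under those assumptions:

import torch
import torch.nn.functional as F
from decoder import eager_attention_forward  # the function defined just above, assuming the file is saved as decoder.py

q = torch.randn(1, 2, 4, 8)   # (batch, heads, seq, head_dim)
k = torch.randn(1, 2, 4, 8)
v = torch.randn(1, 2, 4, 8)

out, _ = eager_attention_forward(torch.nn.Module(), q, k, v, attention_mask=None)
ref = F.scaled_dot_product_attention(q, k, v)               # PyTorch reference, same 1/sqrt(head_dim) scaling
print(torch.allclose(out, ref.transpose(1, 2), atol=1e-5))  # True: same math, (batch, seq, heads, head_dim) layout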
class AVHubertAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[HubertConfig] = None,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        # determine input shapes
        bsz, tgt_len = hidden_states.shape[:-1]
        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len

        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
        kv_input_shape = (bsz, src_len, -1, self.head_dim)

        # get query proj
        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k,v, cross_attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k_proj(current_states)
            value_states = self.v_proj(current_states)
            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
            value_states = value_states.view(*kv_input_shape).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states,
                    value_states,
                    self.layer_idx,
                    {"cache_position": cache_position},
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        # TODO: attn implementation other than eager attention
        # if self.config._attn_implementation != "eager":
        #     attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            output_attentions=output_attentions,
            head_mask=layer_head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights, past_key_value


class AVHubertDecoderLayer(nn.Module):
    def __init__(self, config: HubertConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = AVHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.encoder_attn = AVHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
            layer_idx=layer_idx,
        )

        self.encoder_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = HubertFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if getattr(config, "adapter_attn_dim", None) is not None:
            self.adapter_layer = HubertAttnAdapterLayer(config)
        else:
            self.adapter_layer = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states, self_attn_weights, past_key_value = self.attention(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states, cross_attn_weights, _ = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = hidden_states + residual
            hidden_states = self.encoder_layer_norm(hidden_states)

        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        if self.adapter_layer is not None:
            hidden_states = hidden_states + self.adapter_layer(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (past_key_value,)

        return outputs


class AVHubertDecoderLayerStableLayerNorm(nn.Module):
    def __init__(self, config: HubertConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = AVHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            is_causal=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.encoder_attn = AVHubertAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            config=config,
            layer_idx=layer_idx,
        )
        self.encoder_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = HubertFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if getattr(config, "adapter_attn_dim", None) is not None:
            self.adapter_layer = HubertAttnAdapterLayer(config)
        else:
            self.adapter_layer = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.layer_norm(hidden_states)

        hidden_states, self_attn_weights, past_key_value = self.attention(
            hidden_states=hidden_states,
            past_key_value=past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.encoder_layer_norm(hidden_states)

            hidden_states, cross_attn_weights, _ = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = hidden_states + residual

        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        if self.adapter_layer is not None:
            hidden_states = hidden_states + self.adapter_layer(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (past_key_value,)

        return outputs


class AVHubertDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        if config.learned_pos:
            self.pos_embed = LearnedPositionalEmbedding(
                num_embeddings=config.max_position_embeddings,
                embedding_dim=config.hidden_size,
            )
        else:
            self.pos_embed = SinusoidalPositionalEmbedding(config=config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layers = nn.ModuleList([AVHubertDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        head_mask: torch.Tensor | None = None,
        cross_attn_head_mask: torch.Tensor | None = None,
        past_key_values: EncoderDecoderCache | None = None,
        use_cache: bool | None = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        cache_position: torch.LongTensor | None = None,
    ):
        input_shape = inputs_embeds.shape[:-1]
        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache.from_legacy_cache()
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        batch_size, seq_length = inputs_embeds.size()[:-1]
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length,
                past_key_values_length + seq_length,
                device=inputs_embeds.device,
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        self_attn_cache = (
            past_key_values.self_attention_cache
            if isinstance(past_key_values, EncoderDecoderCache)
            else past_key_values
        )

        attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, self_attn_cache)
        encoder_attention_mask = self._update_cross_attn_mask(
            encoder_hidden_states, encoder_attention_mask, input_shape, inputs_embeds
        )

        # embed positions
        position_embeddings = self.pos_embed(inputs_embeds, past_key_values_length, position_ids=cache_position)
        hidden_states = inputs_embeds + position_embeddings
        hidden_states = self.dropout(hidden_states)

        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != (len(self.layers)):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {head_mask.size()[0]}."
                    )

        for idx, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = torch.rand([])

            skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
            if not skip_the_layer or deepspeed_zero3_is_enabled:
                # under deepspeed zero3 all gpus must run in sync
                # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        encoder_hidden_states,
                        encoder_attention_mask,
                        output_attentions,
                    )
                    raise NotImplementedError("Currently, gradient checkpointing is not supported.")
                else:
                    layer_outputs = layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        encoder_hidden_states=encoder_hidden_states,
                        encoder_attention_mask=encoder_attention_mask,
                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                        cross_attn_layer_head_mask=(
                            cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                        ),
                        past_key_value=past_key_values,
                        output_attentions=output_attentions,
                        use_cache=use_cache,
                        cache_position=cache_position,
                    )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
                layer_outputs = (None, None, None, None)

            if use_cache:
                next_decoder_cache = layer_outputs[3 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.layer_norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_self_attns,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Optional[torch.Tensor],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
    ):
        if self.config._attn_implementation == "flex_attention":
            raise NotImplementedError

        if self.config._attn_implementation == "flash_attention_2":
            raise NotImplementedError

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = True if isinstance(past_key_values, StaticCache) else False

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length),
                fill_value=min_dtype,
                dtype=dtype,
                device=cache_position.device,
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask

    def _update_cross_attn_mask(
        self,
        encoder_hidden_states: Union[torch.Tensor, None],
        encoder_attention_mask: Union[torch.Tensor, None],
        input_shape: torch.Size,
        inputs_embeds: torch.Tensor,
    ):
        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
                # the manual implementation that requires a 4D causal mask in all cases.
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask,
                    inputs_embeds.dtype,
                    tgt_len=input_shape[-1],
                )
            elif self.config._attn_implementation == "flex_attention":
                raise NotImplementedError
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        return encoder_attention_mask
779
+
780
+
781
+ class AVHubertDecoderStableLayerNorm(nn.Module):
782
+ def __init__(self, config):
783
+ super().__init__()
784
+ self.config = config
785
+ if config.learned_pos:
786
+ self.pos_embed = LearnedPositionalEmbedding(
787
+ num_embeddings=config.max_position_embeddings,
788
+ embedding_dim=config.hidden_size,
789
+ )
790
+ else:
791
+ self.pos_embed = SinusoidalPositionalEmbedding(config=config)
792
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
793
+ self.dropout = nn.Dropout(config.hidden_dropout)
794
+ self.layers = nn.ModuleList(
795
+ [
796
+ AVHubertDecoderLayerStableLayerNorm(config, layer_idx=layer_idx)
797
+ for layer_idx in range(config.num_hidden_layers)
798
+ ]
799
+ )
800
+ self.gradient_checkpointing = False
801
+
802
+ def forward(
803
+ self,
804
+ inputs_embeds: torch.Tensor | None = None,
805
+ attention_mask: torch.Tensor | None = None,
806
+ encoder_hidden_states: torch.Tensor | None = None,
807
+ encoder_attention_mask: torch.Tensor | None = None,
808
+ head_mask: torch.Tensor | None = None,
809
+ cross_attn_head_mask: torch.Tensor | None = None,
810
+ past_key_values: EncoderDecoderCache | None = None,
811
+ use_cache: bool | None = None,
812
+ output_attentions: bool = False,
813
+ output_hidden_states: bool = False,
814
+ return_dict: bool = True,
815
+ cache_position: torch.LongTensor | None = None,
816
+ ):
817
+ input_shape = inputs_embeds.shape[:-1]
818
+ if use_cache and past_key_values is None:
819
+ past_key_values = EncoderDecoderCache.from_legacy_cache()
820
+ past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
821
+ batch_size, seq_length = inputs_embeds.size()[:-1]
822
+ if cache_position is None:
823
+ cache_position = torch.arange(
824
+ past_key_values_length,
825
+ past_key_values_length + seq_length,
826
+ device=inputs_embeds.device,
827
+ )
828
+
829
+ if attention_mask is None and not is_torchdynamo_compiling():
830
+ # required mask seq length can be calculated via length of past cache
831
+ mask_seq_length = past_key_values_length + seq_length
832
+ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
833
+
834
+ self_attn_cache = (
835
+ past_key_values.self_attention_cache
836
+ if isinstance(past_key_values, EncoderDecoderCache)
837
+ else past_key_values
838
+ )
839
+
840
+ attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, self_attn_cache)
841
+ encoder_attention_mask = self._update_cross_attn_mask(
842
+ encoder_hidden_states, encoder_attention_mask, input_shape, inputs_embeds
843
+ )
844
+
845
+ # embed positions
846
+ position_embeddings = self.pos_embed(inputs_embeds, past_key_values_length, position_ids=cache_position)
847
+ hidden_states = inputs_embeds + position_embeddings
848
+ hidden_states = self.dropout(hidden_states)
849
+
850
+ deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
851
+
852
+ # decoder layers
853
+ all_hidden_states = () if output_hidden_states else None
854
+ all_self_attns = () if output_attentions else None
855
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
856
+ next_decoder_cache = None
857
+
858
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
859
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
860
+ if attn_mask is not None:
861
+ if attn_mask.size()[0] != (len(self.layers)):
862
+ raise ValueError(
863
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
864
+ f" {head_mask.size()[0]}."
865
+ )
866
+
867
+ for idx, layer in enumerate(self.layers):
868
+ if output_hidden_states:
869
+ all_hidden_states += (hidden_states,)
870
+
871
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
872
+ dropout_probability = torch.rand([])
873
+
874
+ skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
875
+ if not skip_the_layer or deepspeed_zero3_is_enabled:
876
+ # under deepspeed zero3 all gpus must run in sync
877
+ # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication
878
+ if self.gradient_checkpointing and self.training:
879
+ layer_outputs = self._gradient_checkpointing_func(
880
+ layer.__call__,
881
+ hidden_states,
882
+ attention_mask,
883
+ encoder_hidden_states,
884
+ encoder_attention_mask,
885
+ output_attentions,
886
+ )
887
+ raise NotImplementedError("Currently, gradient checkpointing is not supported.")
888
+ else:
889
+ layer_outputs = layer(
890
+ hidden_states,
891
+ attention_mask=attention_mask,
892
+ encoder_hidden_states=encoder_hidden_states,
893
+ encoder_attention_mask=encoder_attention_mask,
894
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
895
+ cross_attn_layer_head_mask=(
896
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
897
+ ),
898
+ past_key_value=past_key_values,
899
+ output_attentions=output_attentions,
900
+ use_cache=use_cache,
901
+ cache_position=cache_position,
902
+ )
903
+ hidden_states = layer_outputs[0]
904
+
905
+ if skip_the_layer:
906
+ layer_outputs = (None, None, None, None)
907
+
908
+ if use_cache:
909
+ next_decoder_cache = layer_outputs[3 if output_attentions else 1]
910
+
911
+ if output_attentions:
912
+ all_self_attns += (layer_outputs[1],)
913
+
914
+ if encoder_hidden_states is not None:
915
+ all_cross_attentions += (layer_outputs[2],)
916
+
917
+ hidden_states = self.layer_norm(hidden_states)
918
+
919
+ # add hidden states from the last decoder layer
920
+ if output_hidden_states:
921
+ all_hidden_states += (hidden_states,)
922
+
923
+ next_cache = next_decoder_cache if use_cache else None
924
+
925
+ if output_hidden_states:
926
+ all_hidden_states = all_hidden_states + (hidden_states,)
927
+
928
+ if not return_dict:
929
+ return tuple(
930
+ v
931
+ for v in [
932
+ hidden_states,
933
+ next_cache,
934
+ all_hidden_states,
935
+ all_self_attns,
936
+ all_cross_attentions,
937
+ ]
938
+ if v is not None
939
+ )
940
+ return BaseModelOutputWithPastAndCrossAttentions(
941
+ last_hidden_state=hidden_states,
942
+ past_key_values=next_cache,
943
+ hidden_states=all_hidden_states,
944
+ attentions=all_self_attns,
945
+ cross_attentions=all_cross_attentions,
946
+ )
947
+
948
+ def _update_causal_mask(
949
+ self,
950
+ attention_mask: Optional[torch.Tensor],
951
+ input_tensor: torch.Tensor,
952
+ cache_position: torch.Tensor,
953
+ past_key_values: Cache,
954
+ ):
955
+ if self.config._attn_implementation == "flex_attention":
956
+ raise NotImplementedError
957
+
958
+ if self.config._attn_implementation == "flash_attention_2":
959
+ raise NotImplementedError
960
+
961
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
962
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
963
+ # to infer the attention mask.
964
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
965
+ using_compilable_cache = True if isinstance(past_key_values, StaticCache) else False
966
+
967
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
968
+ if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
969
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
970
+ attention_mask,
971
+ inputs_embeds=input_tensor,
972
+ past_key_values_length=past_seen_tokens,
973
+ is_training=self.training,
974
+ ):
975
+ return None
976
+
977
+ dtype = input_tensor.dtype
978
+ sequence_length = input_tensor.shape[1]
979
+ if using_compilable_cache:
980
+ target_length = past_key_values.get_max_cache_shape()
981
+ else:
982
+ target_length = (
983
+ attention_mask.shape[-1]
984
+ if isinstance(attention_mask, torch.Tensor)
985
+ else past_seen_tokens + sequence_length + 1
986
+ )
987
+
988
+ # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
989
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
990
+ attention_mask,
991
+ sequence_length=sequence_length,
992
+ target_length=target_length,
993
+ dtype=dtype,
994
+ cache_position=cache_position,
995
+ batch_size=input_tensor.shape[0],
996
+ )
997
+
998
+ if (
999
+ self.config._attn_implementation == "sdpa"
1000
+ and attention_mask is not None
1001
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
1002
+ ):
1003
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1004
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1005
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1006
+ min_dtype = torch.finfo(dtype).min
1007
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1008
+
1009
+ return causal_mask
1010
+
1011
+ @staticmethod
1012
+ def _prepare_4d_causal_attention_mask_with_cache_position(
1013
+ attention_mask: torch.Tensor,
1014
+ sequence_length: int,
1015
+ target_length: int,
1016
+ dtype: torch.dtype,
1017
+ cache_position: torch.Tensor,
1018
+ batch_size: int,
1019
+ **kwargs,
1020
+ ):
1021
+ """
1022
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
1023
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
1024
+
1025
+ Args:
1026
+ attention_mask (`torch.Tensor`):
1027
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
1028
+ `(batch_size, 1, query_length, key_value_length)`.
1029
+ sequence_length (`int`):
1030
+ The sequence length being processed.
1031
+ target_length (`int`):
1032
+ The target length: when generating with static cache, the mask should be as long as the static cache,
1033
+ to account for the 0 padding, the part of the cache that is not filled yet.
1034
+ dtype (`torch.dtype`):
1035
+ The dtype to use for the 4D attention mask.
1036
+ cache_position (`torch.Tensor`):
1037
+ Indices depicting the position of the input sequence tokens in the sequence.
1038
+ batch_size (`torch.Tensor`):
1039
+ Batch size.
1040
+ """
1041
+ if attention_mask is not None and attention_mask.dim() == 4:
1042
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
1043
+ causal_mask = attention_mask
1044
+ else:
1045
+ min_dtype = torch.finfo(dtype).min
1046
+ causal_mask = torch.full(
1047
+ (sequence_length, target_length),
1048
+ fill_value=min_dtype,
1049
+ dtype=dtype,
1050
+ device=cache_position.device,
1051
+ )
1052
+ if sequence_length != 1:
1053
+ causal_mask = torch.triu(causal_mask, diagonal=1)
1054
+ causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
1055
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1056
+ if attention_mask is not None:
1057
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1058
+ mask_length = attention_mask.shape[-1]
1059
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
1060
+ causal_mask.device
1061
+ )
1062
+ padding_mask = padding_mask == 0
1063
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1064
+ padding_mask, min_dtype
1065
+ )
1066
+
1067
+ return causal_mask
1068
+
1069
+ def _update_cross_attn_mask(
1070
+ self,
1071
+ encoder_hidden_states: Union[torch.Tensor, None],
1072
+ encoder_attention_mask: Union[torch.Tensor, None],
1073
+ input_shape: torch.Size,
1074
+ inputs_embeds: torch.Tensor,
1075
+ ):
1076
+ # expand encoder attention mask
1077
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
1078
+ if self.config._attn_implementation == "flash_attention_2":
1079
+ encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
1080
+ elif self.config._attn_implementation == "sdpa":
1081
+ # output_attentions=True & cross_attn_head_mask cannot be supported when using SDPA; we fall back on
1082
+ # the manual implementation that requires a 4D causal mask in all cases.
1083
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1084
+ encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
1085
+ encoder_attention_mask,
1086
+ inputs_embeds.dtype,
1087
+ tgt_len=input_shape[-1],
1088
+ )
1089
+ elif self.config._attn_implementation == "flex_attention":
1090
+ raise NotImplementedError
1091
+ else:
1092
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1093
+ encoder_attention_mask = _prepare_4d_attention_mask(
1094
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
1095
+ )
1096
+
1097
+ return encoder_attention_mask
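
As a quick sanity check on the mask logic above, here is a minimal standalone sketch of what `_prepare_4d_causal_attention_mask_with_cache_position` computes for a 2D padding mask; the batch size, lengths, and token layout are illustrative assumptions, not values used by the model.

```python
import torch


def build_causal_mask(attention_mask, sequence_length, target_length, dtype, cache_position, batch_size):
    """Simplified re-implementation of the 4D causal-mask construction above."""
    min_dtype = torch.finfo(dtype).min
    # Start from a fully blocked upper triangle: 0.0 = attend, min_dtype = blocked.
    causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
    causal_mask = torch.triu(causal_mask, diagonal=1)
    # Allow every key position at or before the current cache position.
    causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1).clone()
    # Fold in the 2D padding mask (1 = real token, 0 = padding).
    mask_length = attention_mask.shape[-1]
    padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
    causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
        padding_mask == 0, min_dtype
    )
    return causal_mask


# One sequence of 4 query tokens, no cache, last key position is padding.
mask = build_causal_mask(
    attention_mask=torch.tensor([[1, 1, 1, 0]]),
    sequence_length=4,
    target_length=4,
    dtype=torch.float32,
    cache_position=torch.arange(4),
    batch_size=1,
)
print(mask.shape)  # torch.Size([1, 1, 4, 4])
```

Blocked positions carry the dtype minimum so they vanish after softmax, while allowed positions stay at zero.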
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 3000,
4
+ "eos_token_id": 3001,
5
+ "pad_token_id": 3002,
6
+ "transformers_version": "4.53.3"
7
+ }
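
The ids above (3000/3001/3002) are the BOS/EOS/PAD tokens the decoder relies on; since `pad_token_id` is used as the embedding's `padding_idx` in `modeling_avhubert.py`, the configured `vocab_size` presumably has to be at least 3003. A small sketch of reading the file back through `transformers` (the repo id is a placeholder):

```python
from transformers import GenerationConfig

# Placeholder repository id; substitute the actual model repo.
gen_config = GenerationConfig.from_pretrained("your-namespace/av_hubert")
print(gen_config.bos_token_id, gen_config.eos_token_id, gen_config.pad_token_id)
# Expected from generation_config.json: 3000 3001 3002
```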
modeling_avhubert.py ADDED
@@ -0,0 +1,391 @@
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ import einops
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from transformers import PreTrainedModel
10
+ from transformers.cache_utils import StaticCache
11
+ from transformers.generation import GenerationMixin
12
+ from transformers.generation.utils import GenerationConfig, GenerationMode
13
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
14
+ from transformers.modeling_outputs import Seq2SeqLMOutput
15
+ from transformers.models.hubert.modeling_hubert import (
16
+ HubertEncoder,
17
+ HubertEncoderStableLayerNorm,
18
+ )
19
+ from transformers.utils import ModelOutput
20
+
21
+ from .configuration_avhubert import AVHubertConfig
22
+ from .configuration_resnet import ResEncoderConfig
23
+ from .decoder import AVHubertDecoder, AVHubertDecoderStableLayerNorm
24
+ from .modeling_resnet import ResEncoder
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+ NEED_SETUP_CACHE_CLASSES_MAPPING = {
29
+ "static": StaticCache,
30
+ }
31
+
32
+
33
+ @dataclass
34
+ class AVHubertOutput:
35
+ last_hidden_state: Optional[torch.Tensor] = None
36
+ hidden_states: Optional[torch.Tensor] = None
37
+ attentions: Optional[torch.Tensor] = None
38
+
39
+
40
+ class AudioFeatureExtractor(nn.Module):
41
+ def __init__(self, input_dim: int, output_dim: int) -> None:
42
+ super(AudioFeatureExtractor, self).__init__()
43
+ self.proj = nn.Linear(in_features=input_dim, out_features=output_dim)
44
+
45
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
46
+ x = self.proj(x) # [B, T, F]
47
+ return einops.rearrange(x, "b t f -> b f t") # [B, F, T]
48
+
49
+
50
+ class VideoFeatureExtractor(nn.Module):
51
+ def __init__(self, config: ResEncoderConfig, output_dim: int) -> None:
52
+ super(VideoFeatureExtractor, self).__init__()
53
+ self.resnet = ResEncoder(config=config)
54
+ self.proj = nn.Linear(
55
+ in_features=self.resnet.backend_out,
56
+ out_features=output_dim,
57
+ )
58
+
59
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
60
+ x = self.resnet(einops.rearrange(x, "b t c h w -> b c t h w")) # [B, F, T]
61
+ x = self.proj(einops.rearrange(x, "b f t -> b t f")) # [B, T, F]
62
+ return einops.rearrange(x, "b t f -> b f t") # [B, F, T]
63
+
64
+
65
+ class AVHubertPreTrainedModel(PreTrainedModel):
66
+ """
67
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
68
+ models.
69
+ """
70
+
71
+ config_class = AVHubertConfig
72
+ base_model_prefix = "avhubert"
73
+ supports_gradient_checkpointing = False
74
+
75
+ def _init_weights(self, module):
76
+ """Initialize the weights"""
77
+ if isinstance(module, (nn.Linear, nn.Embedding)):
78
+ # Slightly different from the TF version which uses truncated_normal for initialization
79
+ # cf https://github.com/pytorch/pytorch/pull/5617
80
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
81
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
82
+ module.bias.data.zero_()
83
+ module.weight.data.fill_(1.0)
84
+ elif isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
85
+ if is_deepspeed_zero3_enabled():
86
+ import deepspeed
87
+
88
+ if hasattr(module, "weight_v") and hasattr(module, "weight_g"):
89
+ with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0):
90
+ nn.init.kaiming_normal_(module.weight.data)
91
+ else:
92
+ with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
93
+ nn.init.kaiming_normal_(module.weight.data)
94
+ else:
95
+ if hasattr(module, "parametrizations"):
96
+ nn.init.kaiming_normal_(module.parametrizations.weight.original0.data)
97
+ nn.init.kaiming_normal_(module.parametrizations.weight.original1.data)
98
+ nn.init.kaiming_normal_(module.weight.data)
99
+
100
+ if isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d, nn.Conv3d)) and module.bias is not None:
101
+ module.bias.data.zero_()
102
+
103
+ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor | int):
104
+ """
105
+ Computes the output length of the convolutional layers
106
+ """
107
+
108
+ def _conv_out_length(input_length, kernel_size, stride):
109
+ # 1D convolutional layer output length formula taken
110
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
111
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
112
+
113
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
114
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
115
+
116
+ return input_lengths
117
+
118
+
119
+ class AVHubertModel(AVHubertPreTrainedModel):
120
+ def __init__(self, config: AVHubertConfig, **kwargs):
121
+ super().__init__(config, **kwargs)
122
+ self.config = config
123
+ self.feat2tar_ratio = config.label_rate / config.sample_rate
124
+
125
+ # feature extractor
126
+ resnet_config = ResEncoderConfig(relu_type=config.resnet_relu_type)
127
+ self.feature_extractor_audio = AudioFeatureExtractor(
128
+ input_dim=config.audio_feat_dim,
129
+ output_dim=config.encoder_embed_dim,
130
+ )
131
+ self.feature_extractor_video = VideoFeatureExtractor(config=resnet_config, output_dim=config.encoder_embed_dim)
132
+
133
+ self.encoder_embed_dim = config.encoder_embed_dim
134
+ if config.modality_fuse == "concat":
135
+ embed = config.encoder_embed_dim * 2
136
+ elif config.modality_fuse == "add":
137
+ embed = config.encoder_embed_dim
138
+ self.post_extract_proj = (
139
+ nn.Linear(embed, config.encoder_embed_dim) if embed != config.encoder_embed_dim else None
140
+ )
141
+
142
+ # dropout
143
+ self.dropout_input = nn.Dropout(config.dropout_input)
144
+
145
+ # transformer encoder
146
+ transformer_config = config.encoder_config
147
+ if transformer_config.do_stable_layer_norm:
148
+ self.encoder = HubertEncoderStableLayerNorm(config=transformer_config)
149
+ else:
150
+ self.encoder = HubertEncoder(config=transformer_config)
151
+ self.layer_norm = nn.LayerNorm(embed)
152
+
153
+ def forward_mask(self, features: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
154
+ extra = attention_mask.size(1) % features.size(1)
155
+ if extra > 0:
156
+ attention_mask = attention_mask[:, :-extra]
157
+ attention_mask = attention_mask.view(attention_mask.size(0), features.size(1), -1)
158
+ attention_mask = attention_mask.all(-1)
159
+ return attention_mask
160
+
161
+ def forward(
162
+ self,
163
+ input_values: Optional[torch.Tensor] = None,
164
+ pixel_values: Optional[torch.Tensor] = None,
165
+ padding_mask: Optional[torch.Tensor] = None,
166
+ output_attentions: bool = False,
167
+ output_hidden_states: bool = False,
168
+ **kwargs,
169
+ ) -> ModelOutput:
170
+ if input_values is not None and pixel_values is None:
171
+ features_audio = self.feature_extractor_audio(input_values) # [B, F, T]
172
+ features_video = torch.zeros_like(features_audio) # [B, F, T]
173
+ elif input_values is None and pixel_values is not None:
174
+ features_video = self.feature_extractor_video(pixel_values) # [B, F, T]
175
+ features_audio = torch.zeros_like(features_video) # [B, F, T]
176
+ elif input_values is not None and pixel_values is not None:
177
+ features_audio = self.feature_extractor_audio(input_values) # [B, F, T]
178
+ features_video = self.feature_extractor_video(pixel_values) # [B, F, T]
179
+ else:
180
+ raise ValueError("Either `input_values` or `pixel_values` must be passed")
181
+
182
+ # fuse modality
183
+ if self.config.modality_fuse == "concat":
184
+ features = torch.cat([features_audio, features_video], dim=1)
185
+ elif self.config.modality_fuse == "add":
186
+ features = features_audio + features_video
187
+
188
+ features = features.transpose(1, 2)
189
+ features = self.layer_norm(features)
190
+
191
+ if padding_mask is not None:
192
+ padding_mask = self.forward_mask(features, padding_mask)
193
+ else:
194
+ padding_mask = torch.zeros(features.size()[:2], dtype=torch.bool, device=features.device)
195
+
196
+ if self.post_extract_proj is not None:
197
+ features = self.post_extract_proj(features)
198
+
199
+ features = self.dropout_input(features)
200
+
201
+ # transformer encoder
202
+ encoder_out = self.encoder(
203
+ hidden_states=features,
204
+ attention_mask=~padding_mask.bool(),
205
+ output_attentions=output_attentions,
206
+ output_hidden_states=output_hidden_states,
207
+ )
208
+
209
+ return AVHubertOutput(
210
+ last_hidden_state=encoder_out.last_hidden_state,
211
+ hidden_states=encoder_out.hidden_states,
212
+ attentions=encoder_out.attentions,
213
+ )
214
+
215
+
216
+ class AVHubertForConditionalGeneration(AVHubertPreTrainedModel, GenerationMixin):
217
+ def __init__(
218
+ self,
219
+ config: AVHubertConfig,
220
+ **kwargs,
221
+ ) -> None:
222
+ super().__init__(config=config, **kwargs)
223
+ self.config = config
224
+
225
+ self.avhubert = AVHubertModel(config=config)
226
+ if config.freeze_base_model:
227
+ self.freeze_base_model()
228
+ if config.freeze_feature_encoder:
229
+ self.freeze_feature_encoder()
230
+
231
+ if config.vocab_size is None:
232
+ raise ValueError(
233
+ f"You are trying to instantiate {self.__class__} with a configuration that "
234
+ "does not define the vocabulary size of the language model head. Please "
235
+ "instantiate the model as follows: `AVHubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
236
+ "or define `vocab_size` of your model's configuration."
237
+ )
238
+
239
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.decoder_embed_dim, padding_idx=config.pad_token_id)
240
+ transformer_config = config.decoder_config
241
+ if transformer_config.do_stable_layer_norm:
242
+ self.decoder = AVHubertDecoderStableLayerNorm(config=transformer_config)
243
+ else:
244
+ self.decoder = AVHubertDecoder(config=transformer_config)
245
+
246
+ self.lm_head = nn.Linear(config.decoder_embed_dim, config.vocab_size, bias=False)
247
+ if config.share_decoder_input_output_embed:
248
+ # If the decoder shares input and output embeddings, the LM head weight is tied
249
+ # to the token embedding matrix; during training, `forward` applies the token
250
+ # embeddings directly (via F.linear) instead of calling the LM head module.
251
+ self.lm_head.weight = self.embed_tokens.weight
252
+ else:
253
+ nn.init.normal_(self.lm_head.weight, mean=0, std=config.decoder_embed_dim**-0.5)
254
+
255
+ self.post_init()
256
+
257
+ def freeze_feature_encoder(self):
258
+ """
259
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
260
+ not be updated during training.
261
+ """
262
+ for param in self.avhubert.feature_extractor_audio.parameters():
263
+ param.requires_grad = False
264
+ for param in self.avhubert.feature_extractor_video.parameters():
265
+ param.requires_grad = False
266
+
267
+ def freeze_base_model(self):
268
+ """
269
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
270
+ be updated during training. Only the classification head will be updated.
271
+ """
272
+ for param in self.avhubert.parameters():
273
+ param.requires_grad = False
274
+
275
+ def get_encoder(self):
276
+ return self.avhubert
277
+
278
+ def forward(
279
+ self,
280
+ input_values: Optional[torch.Tensor] = None,
281
+ pixel_values: Optional[torch.Tensor] = None,
282
+ padding_mask: Optional[torch.Tensor] = None,
283
+ decoder_input_ids: Optional[torch.Tensor] = None,
284
+ decoder_attention_mask: Optional[torch.Tensor] = None,
285
+ labels: Optional[torch.Tensor] = None,
286
+ output_attentions: bool = False,
287
+ output_hidden_states: bool = False,
288
+ return_dict: bool = True,
289
+ ) -> ModelOutput:
290
+ encoder_outs = self.avhubert(
291
+ input_values=input_values,
292
+ pixel_values=pixel_values,
293
+ padding_mask=padding_mask,
294
+ output_attentions=output_attentions,
295
+ output_hidden_states=output_hidden_states,
296
+ )
297
+
298
+ embed_tokens = self.embed_tokens(decoder_input_ids)
299
+ hidden_states = self.decoder(
300
+ inputs_embeds=embed_tokens,
301
+ attention_mask=decoder_attention_mask,
302
+ encoder_hidden_states=encoder_outs.last_hidden_state,
303
+ encoder_attention_mask=~padding_mask.bool(),
304
+ output_attentions=output_attentions,
305
+ output_hidden_states=output_hidden_states,
306
+ )
307
+
308
+ if self.config.share_decoder_input_output_embed:
309
+ logits = F.linear(hidden_states.last_hidden_state, weight=self.embed_tokens.weight)
310
+ else:
311
+ logits = self.lm_head(hidden_states.last_hidden_state)
312
+
313
+ loss = None
314
+ if labels is not None:
315
+ loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
316
+ loss = loss_fn(logits.view(-1, self.config.vocab_size), labels.reshape(-1))
317
+
318
+ return Seq2SeqLMOutput(
319
+ loss=loss,
320
+ logits=logits,
321
+ past_key_values=None,
322
+ decoder_hidden_states=hidden_states.hidden_states,
323
+ decoder_attentions=hidden_states.attentions,
324
+ cross_attentions=None,
325
+ encoder_last_hidden_state=encoder_outs.last_hidden_state,
326
+ encoder_hidden_states=encoder_outs.hidden_states,
327
+ encoder_attentions=encoder_outs.attentions,
328
+ )
329
+
330
+ def _get_generation_mode(
331
+ self,
332
+ generation_config: GenerationConfig,
333
+ assistant_model: PreTrainedModel | None,
334
+ ) -> GenerationMode:
335
+ """
336
+ Returns the generation mode triggered by a [`GenerationConfig`] instance.
337
+ """
338
+ if generation_config.constraints is not None or generation_config.force_words_ids is not None:
339
+ generation_mode = GenerationMode.CONSTRAINED_BEAM_SEARCH
340
+ elif generation_config.num_beams == 1:
341
+ if generation_config.do_sample is False:
342
+ if (
343
+ generation_config.top_k is not None
344
+ and generation_config.top_k > 1
345
+ and generation_config.penalty_alpha is not None
346
+ and generation_config.penalty_alpha > 0
347
+ ):
348
+ generation_mode = GenerationMode.CONTRASTIVE_SEARCH
349
+ else:
350
+ generation_mode = GenerationMode.GREEDY_SEARCH
351
+ else:
352
+ generation_mode = GenerationMode.SAMPLE
353
+ else:
354
+ if generation_config.num_beam_groups > 1:
355
+ generation_mode = GenerationMode.GROUP_BEAM_SEARCH
356
+ elif generation_config.do_sample is True:
357
+ generation_mode = GenerationMode.BEAM_SAMPLE
358
+ else:
359
+ generation_mode = GenerationMode.BEAM_SEARCH
360
+
361
+ # Assisted generation may extend some generation modes
362
+ if assistant_model is not None or generation_config.prompt_lookup_num_tokens is not None:
363
+ if generation_mode in ("greedy_search", "sample"):
364
+ generation_mode = GenerationMode.ASSISTED_GENERATION
365
+ else:
366
+ raise ValueError(
367
+ "You've set `assistant_model`, which triggers assisted generate. Currently, assisted generate "
368
+ "is only supported with Greedy Search and Sample."
369
+ )
370
+ return generation_mode
371
+
372
+ def prepare_inputs_for_generation(
373
+ self,
374
+ input_ids: Optional[torch.Tensor] = None,
375
+ input_values: Optional[torch.Tensor] = None,
376
+ pixel_values: Optional[torch.Tensor] = None,
377
+ decoder_input_ids: Optional[torch.Tensor] = None,
378
+ decoder_attention_mask: Optional[torch.Tensor] = None,
379
+ padding_mask: Optional[torch.Tensor] = None,
380
+ **kwargs,
381
+ ):
382
+ if decoder_input_ids is None:
383
+ decoder_input_ids = input_ids
384
+ decoder_attention_mask = torch.ones_like(input_ids)
385
+ return {
386
+ "input_values": input_values,
387
+ "pixel_values": pixel_values,
388
+ "decoder_input_ids": decoder_input_ids,
389
+ "decoder_attention_mask": decoder_attention_mask,
390
+ "padding_mask": padding_mask,
391
+ }
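
For orientation, a dummy teacher-forced call against the `forward` signature defined above; `model` is assumed to be an already-instantiated `AVHubertForConditionalGeneration`, and every shape (25 video frames of 88x88 single-channel crops, 8 decoder tokens) is an illustrative assumption rather than a documented preprocessing contract.

```python
import torch

batch, frames, seq_len = 1, 25, 8

# Video-only input: [B, T, C, H, W]; the audio branch is zero-filled internally.
pixel_values = torch.randn(batch, frames, 1, 88, 88)
# True marks padded frames; here every frame is real.
padding_mask = torch.zeros(batch, frames, dtype=torch.bool)

decoder_input_ids = torch.full((batch, seq_len), 3000)    # bos_token_id from generation_config.json
decoder_attention_mask = torch.ones(batch, seq_len, dtype=torch.long)
labels = torch.randint(0, 100, (batch, seq_len))           # dummy targets

outputs = model(
    pixel_values=pixel_values,
    padding_mask=padding_mask,
    decoder_input_ids=decoder_input_ids,
    decoder_attention_mask=decoder_attention_mask,
    labels=labels,
)
print(outputs.logits.shape)  # [batch, seq_len, vocab_size]
print(outputs.loss)          # cross-entropy with label smoothing 0.1
```

Inference would instead go through `model.generate(pixel_values=..., padding_mask=...)`, which routes through `prepare_inputs_for_generation` above; whether that works out of the box also depends on configuration fields (e.g. `is_encoder_decoder`) that are not part of this upload.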
modeling_resnet.py ADDED
@@ -0,0 +1,178 @@
1
+ import math
2
+
3
+ import torch.nn as nn
4
+ from transformers import PreTrainedModel
5
+
6
+ from .configuration_resnet import ResEncoderConfig
7
+
8
+
9
+ def conv3x3(in_planes, out_planes, stride=1):
10
+ return nn.Conv2d(
11
+ in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False
12
+ )
13
+
14
+
15
+ def downsample_basic_block(inplanes, outplanes, stride):
16
+ return nn.Sequential(
17
+ nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, bias=False),
18
+ nn.BatchNorm2d(outplanes),
19
+ )
20
+
21
+
22
+ def downsample_basic_block_v2(inplanes, outplanes, stride):
23
+ return nn.Sequential(
24
+ nn.AvgPool2d(
25
+ kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False
26
+ ),
27
+ nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, bias=False),
28
+ nn.BatchNorm2d(outplanes),
29
+ )
30
+
31
+
32
+ class BasicBlock(nn.Module):
33
+ expansion = 1
34
+
35
+ def __init__(self, inplanes, planes, stride=1, downsample=None, relu_type="relu"):
36
+ super(BasicBlock, self).__init__()
37
+
38
+ assert relu_type in ["relu", "prelu"]
39
+
40
+ self.conv1 = conv3x3(inplanes, planes, stride)
41
+ self.bn1 = nn.BatchNorm2d(planes)
42
+
43
+ if relu_type == "relu":
44
+ self.relu1 = nn.ReLU(inplace=True)
45
+ self.relu2 = nn.ReLU(inplace=True)
46
+ elif relu_type == "prelu":
47
+ self.relu1 = nn.PReLU(num_parameters=planes)
48
+ self.relu2 = nn.PReLU(num_parameters=planes)
49
+ else:
50
+ raise Exception("relu type not implemented")
51
+
52
+ self.conv2 = conv3x3(planes, planes)
53
+ self.bn2 = nn.BatchNorm2d(planes)
54
+
55
+ self.downsample = downsample
56
+ self.stride = stride
57
+
58
+ def forward(self, x):
59
+ residual = x
60
+ out = self.conv1(x)
61
+ out = self.bn1(out)
62
+ out = self.relu1(out)
63
+ out = self.conv2(out)
64
+ out = self.bn2(out)
65
+ if self.downsample is not None:
66
+ residual = self.downsample(x)
67
+
68
+ out += residual
69
+ out = self.relu2(out)
70
+
71
+ return out
72
+
73
+
74
+ class ResNet(nn.Module):
75
+ def __init__(
76
+ self,
77
+ block,
78
+ layers,
79
+ num_classes=1000,
80
+ relu_type="relu",
81
+ gamma_zero=False,
82
+ avg_pool_downsample=False,
83
+ ):
84
+ self.inplanes = 64
85
+ self.relu_type = relu_type
86
+ self.gamma_zero = gamma_zero
87
+ self.downsample_block = (
88
+ downsample_basic_block_v2 if avg_pool_downsample else downsample_basic_block
89
+ )
90
+
91
+ super(ResNet, self).__init__()
92
+ self.layer1 = self._make_layer(block, 64, layers[0])
93
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
94
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
95
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
96
+ self.avgpool = nn.AdaptiveAvgPool2d(1)
97
+
98
+ for m in self.modules():
99
+ if isinstance(m, nn.Conv2d):
100
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
101
+ m.weight.data.normal_(0, math.sqrt(2.0 / n))
102
+ elif isinstance(m, nn.BatchNorm2d):
103
+ m.weight.data.fill_(1)
104
+ m.bias.data.zero_()
105
+
106
+ if self.gamma_zero:
107
+ for m in self.modules():
108
+ if isinstance(m, BasicBlock):
109
+ m.bn2.weight.data.zero_()
110
+
111
+ def _make_layer(self, block, planes, blocks, stride=1):
112
+ downsample = None
113
+ if stride != 1 or self.inplanes != planes * block.expansion:
114
+ downsample = self.downsample_block(
115
+ inplanes=self.inplanes,
116
+ outplanes=planes * block.expansion,
117
+ stride=stride,
118
+ )
119
+
120
+ layers = []
121
+ layers.append(
122
+ block(self.inplanes, planes, stride, downsample, relu_type=self.relu_type)
123
+ )
124
+ self.inplanes = planes * block.expansion
125
+ for _ in range(1, blocks):
126
+ layers.append(block(self.inplanes, planes, relu_type=self.relu_type))
127
+
128
+ return nn.Sequential(*layers)
129
+
130
+ def forward(self, x):
131
+ x = self.layer1(x)
132
+ x = self.layer2(x)
133
+ x = self.layer3(x)
134
+ x = self.layer4(x)
135
+ x = self.avgpool(x)
136
+ x = x.view(x.size(0), -1)
137
+ return x
138
+
139
+
140
+ class ResEncoder(PreTrainedModel):
141
+ def __init__(self, config: ResEncoderConfig):
142
+ super(ResEncoder, self).__init__(config=config)
143
+ self.frontend_nout = config.frontend_nout
144
+ self.backend_out = config.backend_out
145
+ frontend_relu = (
146
+ nn.PReLU(num_parameters=self.frontend_nout)
147
+ if config.relu_type == "prelu"
148
+ else nn.ReLU()
149
+ )
150
+ self.frontend3D = nn.Sequential(
151
+ nn.Conv3d(
152
+ 1,
153
+ self.frontend_nout,
154
+ kernel_size=(5, 7, 7),
155
+ stride=(1, 2, 2),
156
+ padding=(2, 3, 3),
157
+ bias=False,
158
+ ),
159
+ nn.BatchNorm3d(self.frontend_nout),
160
+ frontend_relu,
161
+ nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),
162
+ )
163
+ self.trunk = ResNet(BasicBlock, [2, 2, 2, 2], relu_type=config.relu_type)
164
+
165
+ def forward(self, x):
166
+ B, C, T, H, W = x.size()
167
+ x = self.frontend3D(x)
168
+ Tnew = x.shape[2]
169
+ x = self.threeD_to_2D_tensor(x)
170
+ x = self.trunk(x)
171
+ x = x.view(B, Tnew, x.size(1))
172
+ x = x.transpose(1, 2).contiguous()
173
+ return x
174
+
175
+ def threeD_to_2D_tensor(self, x):
176
+ n_batch, n_channels, s_time, sx, sy = x.shape
177
+ x = x.transpose(1, 2).contiguous()
178
+ return x.reshape(n_batch * s_time, n_channels, sx, sy)
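
A shape trace for `ResEncoder.forward`, re-built standalone so the tensor bookkeeping is easy to follow; the 88x88 crop size and `frontend_nout = 64` are assumptions (the config file is not shown here), while the 512-dim trunk output follows from the BasicBlock `[2, 2, 2, 2]` ResNet ending in a 512-channel stage.

```python
import torch
import torch.nn as nn

frontend_nout = 64  # assumed config value
frontend3D = nn.Sequential(
    nn.Conv3d(1, frontend_nout, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False),
    nn.BatchNorm3d(frontend_nout),
    nn.ReLU(),
    nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1)),
)

x = torch.randn(2, 1, 25, 88, 88)             # [B, C, T, H, W]: 2 clips, 25 grayscale frames
x = frontend3D(x)                              # [2, 64, 25, 22, 22]: time kept, H and W divided by 4
B, C, T, H, W = x.shape
x = x.transpose(1, 2).reshape(B * T, C, H, W)  # fold time into the batch: [50, 64, 22, 22]
print(x.shape)
# The ResNet trunk plus adaptive average pooling then maps each frame to a 512-dim
# vector ([50, 512]), which ResEncoder reshapes back to a per-frame sequence [2, 512, 25].
```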
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35ee1a95844cd8f2f45822d0c8c5f167727337bc5a616e95a02b4b0a4341ca2b
3
+ size 653053499
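
Finally, because the modeling code ships inside the repository, loading presumably goes through the remote-code path; the sketch below assumes `config.json` (not part of this upload) maps the custom classes via `auto_map`, and the repo id is a placeholder. The LFS pointer above puts the checkpoint at roughly 650 MB.

```python
from transformers import AutoConfig, AutoModel

repo_id = "your-namespace/av_hubert"  # placeholder

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
model.eval()
```

Depending on the `auto_map` entries, the seq2seq head may be exposed through a different auto class or have to be imported directly from `modeling_avhubert.py`.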