diff --git a/docs/transformers/build/lib/transformers/models/visual_bert/configuration_visual_bert.py b/docs/transformers/build/lib/transformers/models/visual_bert/configuration_visual_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..a866227d3470b9b63f5ca7f2afad458b90cbd11d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/visual_bert/configuration_visual_bert.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VisualBERT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class VisualBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VisualBertModel`]. It is used to instantiate a + VisualBERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the VisualBERT + [uclanlp/visualbert-vqa-coco-pre](https://huggingface.co/uclanlp/visualbert-vqa-coco-pre) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the VisualBERT model. Defines the number of different tokens that can be represented by + the `input_ids` passed when calling [`VisualBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + visual_embedding_dim (`int`, *optional*, defaults to 512): + Dimensionality of the visual embeddings to be passed to the model. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`VisualBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + bypass_transformer (`bool`, *optional*, defaults to `False`): + Whether or not the model should bypass the transformer for the visual embeddings. If set to `True`, the + model directly concatenates the visual embeddings from [`VisualBertEmbeddings`] with the text output from + the transformer, and then passes the result to a self-attention layer. + special_visual_initialize (`bool`, *optional*, defaults to `True`): + Whether or not the visual token type and position type embedding weights should be initialized the same as + the textual token type and position type embeddings. When set to `True`, the weights of the textual token + type and position type embeddings are copied to the respective visual embedding layers. + + + Example: + + ```python + >>> from transformers import VisualBertConfig, VisualBertModel + + >>> # Initializing a VisualBERT visualbert-vqa-coco-pre style configuration + >>> configuration = VisualBertConfig.from_pretrained("uclanlp/visualbert-vqa-coco-pre") + + >>> # Initializing a model (with random weights) from the visualbert-vqa-coco-pre style configuration + >>> model = VisualBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "visual_bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + visual_embedding_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + bypass_transformer=False, + special_visual_initialize=True, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.visual_embedding_dim = visual_embedding_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.bypass_transformer = bypass_transformer + self.special_visual_initialize = special_visual_initialize + + +__all__ = ["VisualBertConfig"] diff --git a/docs/transformers/build/lib/transformers/models/visual_bert/modeling_visual_bert.py b/docs/transformers/build/lib/transformers/models/visual_bert/modeling_visual_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..2d81258a89c4118412f43d88fb8cf49ad67a9de6 --- /dev/null +++ 
b/docs/transformers/build/lib/transformers/models/visual_bert/modeling_visual_bert.py @@ -0,0 +1,1597 @@ +# coding=utf-8 +# Copyright 2021 The UCLA NLP Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch VisualBERT model.""" + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss, KLDivLoss, LogSoftmax + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + MultipleChoiceModelOutput, + SequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_visual_bert import VisualBertConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "VisualBertConfig" +_CHECKPOINT_FOR_DOC = "uclanlp/visualbert-vqa-coco-pre" + + +class VisualBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings and visual embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + # For Visual Features + # Token type and position embedding for image features + self.visual_token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.visual_position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + if config.special_visual_initialize: + self.visual_token_type_embeddings.weight.data = nn.Parameter( + self.token_type_embeddings.weight.data.clone(), requires_grad=True + ) + self.visual_position_embeddings.weight.data = nn.Parameter( + self.position_embeddings.weight.data.clone(), requires_grad=True + ) + + self.visual_projection = nn.Linear(config.visual_embedding_dim, config.hidden_size) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + visual_embeds=None, + visual_token_type_ids=None, + 
image_text_alignment=None, + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + # Absolute Position Embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + if visual_embeds is not None: + if visual_token_type_ids is None: + visual_token_type_ids = torch.ones( + visual_embeds.size()[:-1], dtype=torch.long, device=self.position_ids.device + ) + + visual_embeds = self.visual_projection(visual_embeds) + visual_token_type_embeddings = self.visual_token_type_embeddings(visual_token_type_ids) + + if image_text_alignment is not None: + # image_text_alignment = Batch x image_length x alignment_number. + # Each element denotes the position of the word corresponding to the image feature. -1 is the padding value. + + dtype = token_type_embeddings.dtype + image_text_alignment_mask = (image_text_alignment != -1).long() + # Get rid of the -1. + image_text_alignment = image_text_alignment_mask * image_text_alignment + + # Batch x image_length x alignment length x dim + visual_position_embeddings = self.position_embeddings(image_text_alignment) + visual_position_embeddings *= image_text_alignment_mask.to(dtype=dtype).unsqueeze(-1) + visual_position_embeddings = visual_position_embeddings.sum(2) + + # We want to average along the alignment_number dimension. + image_text_alignment_mask = image_text_alignment_mask.to(dtype=dtype).sum(2) + + if (image_text_alignment_mask == 0).sum() != 0: + image_text_alignment_mask[image_text_alignment_mask == 0] = 1 # Avoid divide by zero error + logger.warning( + "Found 0 values in `image_text_alignment_mask`. Setting them to 1 to avoid divide-by-zero" + " error." + ) + visual_position_embeddings = visual_position_embeddings / image_text_alignment_mask.unsqueeze(-1) + + visual_position_ids = torch.zeros( + *visual_embeds.size()[:-1], dtype=torch.long, device=visual_embeds.device + ) + + # When fine-tuning the detector, the image_text_alignment is sometimes padded too long.
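+                # In that case the alignment-based position embeddings are truncated below to
+                # `visual_embeds.size(1)`; a result *shorter* than the visual sequence cannot be
+                # recovered from, so it raises instead.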
+ if visual_position_embeddings.size(1) != visual_embeds.size(1): + if visual_position_embeddings.size(1) < visual_embeds.size(1): + raise ValueError( + f"Visual position embeddings length: {visual_position_embeddings.size(1)} " + f"should be the same as `visual_embeds` length: {visual_embeds.size(1)}" + ) + visual_position_embeddings = visual_position_embeddings[:, : visual_embeds.size(1), :] + + visual_position_embeddings = visual_position_embeddings + self.visual_position_embeddings( + visual_position_ids + ) + else: + visual_position_ids = torch.zeros( + *visual_embeds.size()[:-1], dtype=torch.long, device=visual_embeds.device + ) + visual_position_embeddings = self.visual_position_embeddings(visual_position_ids) + + visual_embeddings = visual_embeds + visual_position_embeddings + visual_token_type_embeddings + + embeddings = torch.cat((embeddings, visual_embeddings), dim=1) + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class VisualBertSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask (precomputed for all layers in the VisualBertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->VisualBert +class VisualBertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class VisualBertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = VisualBertSelfAttention(config) + self.output = VisualBertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->VisualBert +class VisualBertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->VisualBert +class VisualBertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def 
forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class VisualBertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = VisualBertAttention(config) + self.intermediate = VisualBertIntermediate(config) + self.output = VisualBertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class VisualBertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([VisualBertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + all_hidden_states, + all_self_attentions, + ] + if v is not None + ) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->VisualBert +class VisualBertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
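+        # With VisualBERT, visual embeddings are appended after the text tokens, so index 0
+        # is still the text [CLS] token even when visual inputs are present.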
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->VisualBert +class VisualBertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->VisualBert +class VisualBertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = VisualBertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->VisualBert +class VisualBertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = VisualBertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class VisualBertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = VisualBertConfig + base_model_prefix = "visual_bert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, VisualBertLMPredictionHead): + module.bias.data.zero_() + + +@dataclass +class VisualBertForPreTrainingOutput(ModelOutput): + """ + Output type of [`VisualBertForPreTraining`]. 
+ + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the sentence-image prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the sentence-image prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + seq_relationship_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +VISUAL_BERT_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage + and behavior. + + Parameters: + config ([`VisualBertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VISUAL_BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token.
+ + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + visual_embeds (`torch.FloatTensor` of shape `(batch_size, visual_seq_length, visual_embedding_dim)`, *optional*): + The embedded representation of the visual inputs, generally derived using an object detector. + + visual_attention_mask (`torch.FloatTensor` of shape `(batch_size, visual_seq_length)`, *optional*): + Mask to avoid performing attention on visual embeddings. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + visual_token_type_ids (`torch.LongTensor` of shape `(batch_size, visual_seq_length)`, *optional*): + Segment token indices to indicate different portions of the visual embeds. + + [What are token type IDs?](../glossary#token-type-ids) The authors of VisualBERT set the + *visual_token_type_ids* to *1* for all tokens. + + image_text_alignment (`torch.LongTensor` of shape `(batch_size, visual_seq_length, alignment_number)`, *optional*): + Image-text alignment used to decide the position IDs of the visual embeddings. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare VisualBert Model transformer outputting raw hidden-states without any specific head on top.", + VISUAL_BERT_START_DOCSTRING, +) +class VisualBertModel(VisualBertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = VisualBertEmbeddings(config) + self.encoder = VisualBertEncoder(config) + + self.pooler = VisualBertPooler(config) if add_pooling_layer else None + + self.bypass_transformer = config.bypass_transformer + + if self.bypass_transformer: + self.additional_layer = VisualBertLayer(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + visual_embeds: Optional[torch.FloatTensor] = None, + visual_attention_mask: Optional[torch.LongTensor] = None, + visual_token_type_ids: Optional[torch.LongTensor] = None, + image_text_alignment: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: + r""" + + Returns: + + Example: + + ```python + # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image. 
+ from transformers import AutoTokenizer, VisualBertModel + import torch + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre") + + inputs = tokenizer("The capital of France is Paris.", return_tensors="pt") + visual_embeds = get_visual_embeddings(image).unsqueeze(0) + visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) + visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) + + inputs.update( + { + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + } + ) + + outputs = model(**inputs) + + last_hidden_states = outputs.last_hidden_state + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if visual_embeds is not None: + visual_input_shape = visual_embeds.size()[:-1] + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + + if visual_embeds is not None and visual_attention_mask is None: + visual_attention_mask = torch.ones(visual_input_shape, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
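+        # Below, the text and visual attention masks are concatenated so that self-attention
+        # runs over the single combined (text + visual) sequence built by the embeddings.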
+ if visual_embeds is not None: + combined_attention_mask = torch.cat((attention_mask, visual_attention_mask), dim=-1) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + combined_attention_mask, (batch_size, input_shape + visual_input_shape) + ) + + else: + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, (batch_size, input_shape) + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + visual_embeds=visual_embeds, + visual_token_type_ids=visual_token_type_ids, + image_text_alignment=image_text_alignment, + ) + + if self.bypass_transformer and visual_embeds is not None: + text_length = input_ids.size(1) + text_embedding_output = embedding_output[:, :text_length, :] + visual_embedding_output = embedding_output[:, text_length:, :] + + text_extended_attention_mask = extended_attention_mask[:, :, text_length, :text_length] + + encoded_outputs = self.encoder( + text_embedding_output, + attention_mask=text_extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoded_outputs[0] + concatenated_input = torch.cat((sequence_output, visual_embedding_output), dim=1) + sequence_output = self.additional_layer(concatenated_input, extended_attention_mask) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + else: + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + VisualBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a + `sentence-image prediction (classification)` head. 
+ """, + VISUAL_BERT_START_DOCSTRING, +) +class VisualBertForPreTraining(VisualBertPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.visual_bert = VisualBertModel(config) + self.cls = VisualBertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=VisualBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + visual_embeds: Optional[torch.FloatTensor] = None, + visual_attention_mask: Optional[torch.LongTensor] = None, + visual_token_type_ids: Optional[torch.LongTensor] = None, + image_text_alignment: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + sentence_image_labels: Optional[torch.LongTensor] = None, + ) -> Union[Tuple[torch.Tensor], VisualBertForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, total_sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + sentence_image_labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sentence-image prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a matching pair of sequence A for the given image, + - 1 indicates sequence B is a random sequence w.r.t A for the given image. + + Returns: + + Example: + + ```python + # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch. 
+ from transformers import AutoTokenizer, VisualBertForPreTraining + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre") + + inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt") + visual_embeds = get_visual_embeddings(image).unsqueeze(0) + visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) + visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) + + inputs.update( + { + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + } + ) + max_length = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2] + labels = tokenizer( + "The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length + )["input_ids"] + sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size + + + outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels) + loss = outputs.loss + prediction_logits = outputs.prediction_logits + seq_relationship_logits = outputs.seq_relationship_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + total_size = attention_mask.size(-1) + visual_attention_mask.size(-1) + if labels.size(-1) != total_size: + raise ValueError( + "The labels provided should have same sequence length as total attention mask. " + f"Found labels with sequence length {labels.size(-1)}, expected {total_size}." + ) + + outputs = self.visual_bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + image_text_alignment=image_text_alignment, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and sentence_image_labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + sentence_image_loss = loss_fct(seq_relationship_score.view(-1, 2), sentence_image_labels.view(-1)) + total_loss = masked_lm_loss + sentence_image_loss + + elif labels is not None: + loss_fct = CrossEntropyLoss() + total_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return VisualBertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + VisualBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and + a softmax) e.g. for VCR tasks. 
+ """, + VISUAL_BERT_START_DOCSTRING, +) +class VisualBertForMultipleChoice(VisualBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.visual_bert = VisualBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.cls = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @replace_return_docstrings(output_type=MultipleChoiceModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + visual_embeds: Optional[torch.FloatTensor] = None, + visual_attention_mask: Optional[torch.LongTensor] = None, + visual_token_type_ids: Optional[torch.LongTensor] = None, + image_text_alignment: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + + Returns: + + Example: + + ```python + # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch. + from transformers import AutoTokenizer, VisualBertForMultipleChoice + import torch + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr") + + prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + choice0 = "It is eaten with a fork and a knife." + choice1 = "It is eaten while held in the hand." 
+ + visual_embeds = get_visual_embeddings(image) + # (batch_size, num_choices, visual_seq_length, visual_embedding_dim) + visual_embeds = visual_embeds.expand(1, 2, *visual_embeds.shape) + visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) + visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) + + labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 + + encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True) + # batch size is 1 + inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()} + inputs_dict.update( + { + "visual_embeds": visual_embeds, + "visual_attention_mask": visual_attention_mask, + "visual_token_type_ids": visual_token_type_ids, + "labels": labels, + } + ) + outputs = model(**inputs_dict) + + loss = outputs.loss + logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + visual_embeds = ( + visual_embeds.view(-1, visual_embeds.size(-2), visual_embeds.size(-1)) + if visual_embeds is not None + else None + ) + visual_attention_mask = ( + visual_attention_mask.view(-1, visual_attention_mask.size(-1)) + if visual_attention_mask is not None + else None + ) + visual_token_type_ids = ( + visual_token_type_ids.view(-1, visual_token_type_ids.size(-1)) + if visual_token_type_ids is not None + else None + ) + + outputs = self.visual_bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + image_text_alignment=image_text_alignment, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + _, pooled_output = outputs[0], outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.cls(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + VisualBert Model with a classification/regression head on top (a dropout and a linear layer on top of the pooled + output) for VQA. 
+ """, + VISUAL_BERT_START_DOCSTRING, +) +class VisualBertForQuestionAnswering(VisualBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.visual_bert = VisualBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.cls = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + visual_embeds: Optional[torch.FloatTensor] = None, + visual_attention_mask: Optional[torch.LongTensor] = None, + visual_token_type_ids: Optional[torch.LongTensor] = None, + image_text_alignment: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, total_sequence_length)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. A KLDivLoss is computed between the labels and the returned logits. + + Returns: + + Example: + + ```python + # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch. + from transformers import AutoTokenizer, VisualBertForQuestionAnswering + import torch + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa") + + text = "Who is eating the apple?" 
+ inputs = tokenizer(text, return_tensors="pt") + visual_embeds = get_visual_embeddings(image).unsqueeze(0) + visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) + visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) + + inputs.update( + { + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + } + ) + + labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0) # Batch size 1, Num labels 2 + + outputs = model(**inputs, labels=labels) + loss = outputs.loss + scores = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Get the index of the last text token + index_to_gather = attention_mask.sum(1) - 2 # as in original code + + outputs = self.visual_bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + image_text_alignment=image_text_alignment, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # TO-CHECK: From the original code + index_to_gather = ( + index_to_gather.unsqueeze(-1).unsqueeze(-1).expand(index_to_gather.size(0), 1, sequence_output.size(-1)) + ) + pooled_output = torch.gather(sequence_output, 1, index_to_gather) + + pooled_output = self.dropout(pooled_output) + logits = self.cls(pooled_output) + reshaped_logits = logits.view(-1, self.num_labels) + + loss = None + if labels is not None: + loss_fct = nn.KLDivLoss(reduction="batchmean") + log_softmax = nn.LogSoftmax(dim=-1) + reshaped_logits = log_softmax(reshaped_logits) + loss = loss_fct(reshaped_logits, labels.contiguous()) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + VisualBert Model with a sequence classification head on top (a dropout and a linear layer on top of the pooled + output) for Visual Reasoning e.g. for NLVR task. 
+ """, + VISUAL_BERT_START_DOCSTRING, +) +class VisualBertForVisualReasoning(VisualBertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.visual_bert = VisualBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.cls = nn.Linear(config.hidden_size, config.num_labels) # 2 + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + visual_embeds: Optional[torch.FloatTensor] = None, + visual_attention_mask: Optional[torch.LongTensor] = None, + visual_token_type_ids: Optional[torch.LongTensor] = None, + image_text_alignment: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. A classification loss is computed (Cross-Entropy) against these labels. + + Returns: + + Example: + + ```python + # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch. + from transformers import AutoTokenizer, VisualBertForVisualReasoning + import torch + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2") + + text = "Who is eating the apple?" 
+ inputs = tokenizer(text, return_tensors="pt") + visual_embeds = get_visual_embeddings(image).unsqueeze(0) + visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) + visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) + + inputs.update( + { + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + } + ) + + labels = torch.tensor(1).unsqueeze(0) # Batch size 1, Num choices 2 + + outputs = model(**inputs, labels=labels) + loss = outputs.loss + scores = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.visual_bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + image_text_alignment=image_text_alignment, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # sequence_output = outputs[0] + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.cls(pooled_output) + reshaped_logits = logits.contiguous() + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class VisualBertRegionToPhraseAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.num_attention_heads = 1 # config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, query, key, attention_mask): + attention_mask = attention_mask.to(query.dtype) + attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = (1.0 - attention_mask) * torch.finfo(query.dtype).min + + mixed_query_layer = self.query(query) + mixed_key_layer = self.key(key) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + attention_scores = attention_scores + attention_mask + + attention_scores = attention_scores.squeeze(1) + return attention_scores + + +@add_start_docstrings( + """ + VisualBert Model with a Masked Language Modeling head 
and an attention layer on top for Region-to-Phrase Alignment
+    e.g. for the Flickr30k Entities task.
+    """,
+    VISUAL_BERT_START_DOCSTRING,
+)
+class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.visual_bert = VisualBertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.cls = VisualBertPreTrainingHeads(config)
+        self.attention = VisualBertRegionToPhraseAttention(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(VISUAL_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        visual_embeds: Optional[torch.FloatTensor] = None,
+        visual_attention_mask: Optional[torch.LongTensor] = None,
+        visual_token_type_ids: Optional[torch.LongTensor] = None,
+        image_text_alignment: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        region_to_phrase_position: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        region_to_phrase_position (`torch.LongTensor` of shape `(batch_size, total_sequence_length)`, *optional*):
+            The positions, in the total (textual + visual) input sequence, of the tokens whose alignment with the
+            image regions should be predicted. Positions set to -1 are masked out (treated as padding).
+
+        labels (`torch.LongTensor` of shape `(batch_size, total_sequence_length, visual_sequence_length)`, *optional*):
+            Labels for computing the region-to-phrase alignment loss. A KL-divergence loss is computed between these
+            labels and the attention scores returned by the alignment head.
+
+        Returns:
+
+        Example:
+
+        ```python
+        # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch.
+        from transformers import AutoTokenizer, VisualBertForRegionToPhraseAlignment
+        import torch
+
+        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        model = VisualBertForRegionToPhraseAlignment.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
+
+        text = "Who is eating the apple?"
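+        # `get_visual_embeddings` is again a stand-in for a visual backbone (a sketch of a
+        # placeholder is given in the VisualBertForVisualReasoning example above). Note that
+        # `region_to_phrase_position` and `labels` below span the *total* sequence length,
+        # i.e. the number of text tokens plus the number of visual embeddings.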
+ inputs = tokenizer(text, return_tensors="pt") + visual_embeds = get_visual_embeddings(image).unsqueeze(0) + visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) + visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) + region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2])) + + inputs.update( + { + "region_to_phrase_position": region_to_phrase_position, + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + } + ) + + labels = torch.ones( + (1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2], visual_embeds.shape[-2]) + ) # Batch size 1 + + outputs = model(**inputs, labels=labels) + loss = outputs.loss + scores = outputs.logits + ```""" + if region_to_phrase_position is None: + raise ValueError("`region_to_phrase_position` should not be None when using Flickr Model.") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.visual_bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + visual_embeds=visual_embeds, + visual_attention_mask=visual_attention_mask, + visual_token_type_ids=visual_token_type_ids, + image_text_alignment=image_text_alignment, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + region_to_phrase_position_mask = (region_to_phrase_position != -1).long() + + # Make the -1 become 0 + region_to_phrase_position = region_to_phrase_position * region_to_phrase_position_mask + + # Selected_positions = batch x selected position x dim + expanded_region_to_phrase_positions = region_to_phrase_position.unsqueeze(2).expand( + region_to_phrase_position.size(0), region_to_phrase_position.size(1), sequence_output.size(2) + ) + selected_positions = sequence_output.gather(1, expanded_region_to_phrase_positions) + + # Visual Features = batch x visual_feature_length x dim + # This will need separate image and visual masks. + visual_features = sequence_output[:, attention_mask.size(1) :] + + if visual_features.size(1) != visual_attention_mask.size(1): + raise ValueError( + f"Visual features length :{visual_features.size(1)} should be the same" + f" as visual attention mask length: {visual_attention_mask.size(1)}." 
+ ) + + logits = self.attention(selected_positions, visual_features, visual_attention_mask) + + loss = None + + if labels is not None: + # scores = batch x selected position x visual_feature + # scores = selected_positions.bmm(visual_features.transpose(1,2)) + # label = batch x selected_postion x needed position + loss_fct = KLDivLoss(reduction="batchmean") + log_softmax = LogSoftmax(dim=-1) + scores = log_softmax(logits) + labels = labels.contiguous() + loss = loss_fct(scores, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "VisualBertForMultipleChoice", + "VisualBertForPreTraining", + "VisualBertForQuestionAnswering", + "VisualBertForRegionToPhraseAlignment", + "VisualBertForVisualReasoning", + "VisualBertLayer", + "VisualBertModel", + "VisualBertPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/vit/__init__.py b/docs/transformers/build/lib/transformers/models/vit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d6a7a23fa63f4c95102584b90d7f775b746ce49 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vit import * + from .feature_extraction_vit import * + from .image_processing_vit import * + from .image_processing_vit_fast import * + from .modeling_flax_vit import * + from .modeling_tf_vit import * + from .modeling_vit import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/vit/configuration_vit.py b/docs/transformers/build/lib/transformers/models/vit/configuration_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..13ad3a7715c5dd411147e33823e0d69e7106005d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/configuration_vit.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2021 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ViT model configuration"""
+
+from collections import OrderedDict
+from typing import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ViTConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ViTModel`]. It is used to instantiate a ViT
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+    the defaults will yield a similar configuration to that of the ViT
+    [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        encoder_stride (`int`, *optional*, defaults to 16):
+            Factor to increase the spatial resolution by in the decoder head for masked image modeling.
+        pooler_output_size (`int`, *optional*):
+            Dimensionality of the pooler layer. If None, defaults to `hidden_size`.
+        pooler_act (`str`, *optional*, defaults to `"tanh"`):
+            The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and
+            PyTorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are
+            supported for TensorFlow.
+ + Example: + + ```python + >>> from transformers import ViTConfig, ViTModel + + >>> # Initializing a ViT vit-base-patch16-224 style configuration + >>> configuration = ViTConfig() + + >>> # Initializing a model (with random weights) from the vit-base-patch16-224 style configuration + >>> model = ViTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vit" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + encoder_stride=16, + pooler_output_size=None, + pooler_act="tanh", + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.encoder_stride = encoder_stride + self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size + self.pooler_act = pooler_act + + +class ViTOnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + +__all__ = ["ViTConfig", "ViTOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/vit/convert_dino_to_pytorch.py b/docs/transformers/build/lib/transformers/models/vit/convert_dino_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..8608da8eb411644ab214666560cccae5a9285213 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/convert_dino_to_pytorch.py @@ -0,0 +1,218 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
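+# Example invocation (a sketch; the dump folder path is a placeholder):
+#
+#   python convert_dino_to_pytorch.py --model_name dino_vitb16 \
+#       --pytorch_dump_folder_path ./dino_vitb16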
+"""Convert ViT checkpoints trained with the DINO method.""" + +import argparse +import json +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import ViTConfig, ViTForImageClassification, ViTImageProcessor, ViTModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend( + [ + ("cls_token", "vit.embeddings.cls_token"), + ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), + ("pos_embed", "vit.embeddings.position_embeddings"), + ] + ) + + if base_model: + # layernorm + pooler + rename_keys.extend( + [ + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ] + ) + + # if just the base model, we should remove "vit" from all keys that start with "vit" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "vit.layernorm.weight"), + ("norm.bias", "vit.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, base_model=False): + for i in range(config.num_hidden_layers): + if base_model: + prefix = "" + else: + prefix = "vit." 
+ # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.weight", "head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_vit_checkpoint(model_name, pytorch_dump_folder_path, base_model=True): + """ + Copy/paste/tweak model's weights to our ViT structure. + """ + + # define default ViT configuration + config = ViTConfig() + # patch_size + if model_name[-1] == "8": + config.patch_size = 8 + # set labels if required + if not base_model: + config.num_labels = 1000 + repo_id = "huggingface/label-files" + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + # size of the architecture + if model_name in ["dino_vits8", "dino_vits16"]: + config.hidden_size = 384 + config.intermediate_size = 1536 + config.num_hidden_layers = 12 + config.num_attention_heads = 6 + + # load original model from torch hub + original_model = torch.hub.load("facebookresearch/dino:main", model_name) + original_model.eval() + + # load state_dict of original model, remove and rename some keys + state_dict = original_model.state_dict() + if base_model: + remove_classification_head_(state_dict) + rename_keys = create_rename_keys(config, base_model=base_model) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, base_model) + + # load HuggingFace model + if base_model: + model = ViTModel(config, add_pooling_layer=False).eval() + else: + model = ViTForImageClassification(config).eval() + model.load_state_dict(state_dict) + + # Check outputs on an image, prepared by ViTImageProcessor + image_processor = ViTImageProcessor() + encoding = image_processor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + outputs = model(pixel_values) + + if base_model: + final_hidden_state_cls_token = original_model(pixel_values) + assert torch.allclose(final_hidden_state_cls_token, outputs.last_hidden_state[:, 0, :], atol=1e-1) + else: + logits = original_model(pixel_values) + assert 
logits.shape == outputs.logits.shape + assert torch.allclose(logits, outputs.logits, atol=1e-3) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="dino_vitb16", + type=str, + help="Name of the model trained with DINO you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--base_model", + action="store_true", + help="Whether to only convert the base model (no projection head weights).", + ) + + parser.set_defaults(base_model=True) + args = parser.parse_args() + convert_vit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.base_model) diff --git a/docs/transformers/build/lib/transformers/models/vit/convert_vit_timm_to_pytorch.py b/docs/transformers/build/lib/transformers/models/vit/convert_vit_timm_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..7892842f8dc18d4eea990acc6a4bac31ff916919 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/convert_vit_timm_to_pytorch.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
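+# Example invocation (a sketch; the dump folder path is a placeholder):
+#
+#   python convert_vit_timm_to_pytorch.py --vit_name vit_base_patch16_224 \
+#       --pytorch_dump_folder_path ./vit_base_patch16_224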
+"""Convert ViT and non-distilled DeiT checkpoints from the timm library.""" + +import argparse +from pathlib import Path + +import requests +import timm +import torch +from PIL import Image +from timm.data import ImageNetInfo, infer_imagenet_subset + +from transformers import DeiTImageProcessor, ViTConfig, ViTForImageClassification, ViTImageProcessor, ViTModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append((f"blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight")) + rename_keys.append((f"blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend( + [ + ("cls_token", "vit.embeddings.cls_token"), + ("patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), + ("patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), + ("pos_embed", "vit.embeddings.position_embeddings"), + ] + ) + + if base_model: + # layernorm + rename_keys.extend( + [ + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ] + ) + + # if just the base model, we should remove "vit" from all keys that start with "vit" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "vit.layernorm.weight"), + ("norm.bias", "vit.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, base_model=False): + for i in range(config.num_hidden_layers): + if base_model: + prefix = "" + else: + prefix = "vit." 
+ # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.weight", "head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our ViT structure. + """ + + # define default ViT configuration + config = ViTConfig() + base_model = False + + # load original model from timm + timm_model = timm.create_model(vit_name, pretrained=True) + timm_model.eval() + + # detect unsupported ViT models in transformers + # fc_norm is present + if not isinstance(getattr(timm_model, "fc_norm", None), torch.nn.Identity): + raise ValueError(f"{vit_name} is not supported in transformers because of the presence of fc_norm.") + + # use of global average pooling in combination (or without) class token + if getattr(timm_model, "global_pool", None) == "avg": + raise ValueError(f"{vit_name} is not supported in transformers because of use of global average pooling.") + + # CLIP style vit with norm_pre layer present + if "clip" in vit_name and not isinstance(getattr(timm_model, "norm_pre", None), torch.nn.Identity): + raise ValueError( + f"{vit_name} is not supported in transformers because it's a CLIP style ViT with norm_pre layer." + ) + + # SigLIP style vit with attn_pool layer present + if "siglip" in vit_name and getattr(timm_model, "global_pool", None) == "map": + raise ValueError( + f"{vit_name} is not supported in transformers because it's a SigLIP style ViT with attn_pool." 
+ ) + + # use of layer scale in ViT model blocks + if not isinstance(getattr(timm_model.blocks[0], "ls1", None), torch.nn.Identity) or not isinstance( + getattr(timm_model.blocks[0], "ls2", None), torch.nn.Identity + ): + raise ValueError(f"{vit_name} is not supported in transformers because it uses a layer scale in its blocks.") + + # Hybrid ResNet-ViTs + if not isinstance(timm_model.patch_embed, timm.layers.PatchEmbed): + raise ValueError(f"{vit_name} is not supported in transformers because it is a hybrid ResNet-ViT.") + + # get patch size and image size from the patch embedding submodule + config.patch_size = timm_model.patch_embed.patch_size[0] + config.image_size = timm_model.patch_embed.img_size[0] + + # retrieve architecture-specific parameters from the timm model + config.hidden_size = timm_model.embed_dim + config.intermediate_size = timm_model.blocks[0].mlp.fc1.out_features + config.num_hidden_layers = len(timm_model.blocks) + config.num_attention_heads = timm_model.blocks[0].attn.num_heads + + # check whether the model has a classification head or not + if timm_model.num_classes != 0: + config.num_labels = timm_model.num_classes + # infer ImageNet subset from timm model + imagenet_subset = infer_imagenet_subset(timm_model) + dataset_info = ImageNetInfo(imagenet_subset) + config.id2label = {i: dataset_info.index_to_label_name(i) for i in range(dataset_info.num_classes())} + config.label2id = {v: k for k, v in config.id2label.items()} + else: + print(f"{vit_name} is going to be converted as a feature extractor only.") + base_model = True + + # load state_dict of original model + state_dict = timm_model.state_dict() + + # remove and rename some keys in the state dict + if base_model: + remove_classification_head_(state_dict) + rename_keys = create_rename_keys(config, base_model) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, base_model) + + # load HuggingFace model + if base_model: + model = ViTModel(config, add_pooling_layer=False).eval() + else: + model = ViTForImageClassification(config).eval() + model.load_state_dict(state_dict) + + # Check outputs on an image, prepared by ViTImageProcessor/DeiTImageProcessor + if "deit" in vit_name: + image_processor = DeiTImageProcessor(size=config.image_size) + else: + image_processor = ViTImageProcessor(size=config.image_size) + encoding = image_processor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + outputs = model(pixel_values) + + if base_model: + timm_pooled_output = timm_model.forward_features(pixel_values) + assert timm_pooled_output.shape == outputs.last_hidden_state.shape + assert torch.allclose(timm_pooled_output, outputs.last_hidden_state, atol=1e-1) + else: + timm_logits = timm_model(pixel_values) + assert timm_logits.shape == outputs.logits.shape + assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {vit_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--vit_name", + default="vit_base_patch16_224", + type=str, + help="Name of the ViT timm model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to 
the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_vit_checkpoint(args.vit_name, args.pytorch_dump_folder_path) diff --git a/docs/transformers/build/lib/transformers/models/vit/feature_extraction_vit.py b/docs/transformers/build/lib/transformers/models/vit/feature_extraction_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..c6a93df0bd8f82c81a715f78713a27b17672ecf6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/feature_extraction_vit.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for ViT.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_vit import ViTImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class ViTFeatureExtractor(ViTImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class ViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use ViTImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["ViTFeatureExtractor"] diff --git a/docs/transformers/build/lib/transformers/models/vit/image_processing_vit.py b/docs/transformers/build/lib/transformers/models/vit/image_processing_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..ade7495b1d4a690a0f41319aed6bde3a5078de77 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/image_processing_vit.py @@ -0,0 +1,288 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
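+# Example usage (a sketch; `image` stands for any PIL image or numpy array):
+#
+#   from transformers import ViTImageProcessor
+#
+#   processor = ViTImageProcessor()
+#   batch = processor(images=image, return_tensors="pt")
+#   # batch["pixel_values"] has shape (1, 3, 224, 224) with the default size and scaling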
+"""Image processor class for ViT.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class ViTImageProcessor(BaseImageProcessor): + r""" + Constructs a ViT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*): + Whether to convert the image to RGB. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + do_convert_rgb: Optional[bool] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["ViTImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/vit/image_processing_vit_fast.py b/docs/transformers/build/lib/transformers/models/vit/image_processing_vit_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..61277792cdaa9dda4550221baeb4addae4b8c49e --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/image_processing_vit_fast.py @@ -0,0 +1,45 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for ViT."""
+
+from ...image_processing_utils_fast import (
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+    BaseImageProcessorFast,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    PILImageResampling,
+)
+from ...utils import (
+    add_start_docstrings,
+)
+
+
+@add_start_docstrings(
+    "Constructs a fast ViT image processor.",
+    BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
+)
+class ViTImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 224, "width": 224}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+
+
+__all__ = ["ViTImageProcessorFast"]
diff --git a/docs/transformers/build/lib/transformers/models/vit/modeling_flax_vit.py b/docs/transformers/build/lib/transformers/models/vit/modeling_flax_vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cf3477b5ddff66caac2b84ed6ba2f0e00336780
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/vit/modeling_flax_vit.py
@@ -0,0 +1,677 @@
+# coding=utf-8
+# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Tuple
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxBaseModelOutputWithPooling, FlaxSequenceClassifierOutput
+from ...modeling_flax_utils import (
+    ACT2FN,
+    FlaxPreTrainedModel,
+    append_replace_return_docstrings,
+    overwrite_call_docstring,
+)
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
+from .configuration_vit import ViTConfig
+
+
+VIT_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).
+
+    This model is also a
+    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
+    a regular Flax linen Module and refer to the Flax documentation for all matters related to general usage and
+    behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified, all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+VIT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
+            for details.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class FlaxViTPatchEmbeddings(nn.Module):
+    config: ViTConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        image_size = self.config.image_size
+        patch_size = self.config.patch_size
+        num_patches = (image_size // patch_size) * (image_size // patch_size)
+        self.num_patches = num_patches
+        self.num_channels = self.config.num_channels
+        self.projection = nn.Conv(
+            self.config.hidden_size,
+            kernel_size=(patch_size, patch_size),
+            strides=(patch_size, patch_size),
+            padding="VALID",
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, "fan_in", "truncated_normal"
+            ),
+        )
+
+    def __call__(self, pixel_values):
+        num_channels = pixel_values.shape[-1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
+            )
+        embeddings = self.projection(pixel_values)
+        batch_size, _, _, channels = embeddings.shape
+        return jnp.reshape(embeddings, (batch_size, -1, channels))
+
+
+class FlaxViTEmbeddings(nn.Module):
+    """Construct the CLS token, position and patch embeddings."""
+
+    config: ViTConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.cls_token = self.param(
+            "cls_token",
+            jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"),
+            (1, 1, self.config.hidden_size),
+        )
+        self.patch_embeddings = FlaxViTPatchEmbeddings(self.config, dtype=self.dtype)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = self.param(
+            "position_embeddings",
+            jax.nn.initializers.variance_scaling(self.config.initializer_range**2, "fan_in", "truncated_normal"),
+            (1, num_patches + 1, self.config.hidden_size),
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def __call__(self, pixel_values, deterministic=True):
+        batch_size = pixel_values.shape[0]
+
+        embeddings = self.patch_embeddings(pixel_values)
+
+        cls_tokens = jnp.broadcast_to(self.cls_token, (batch_size, 1, self.config.hidden_size))
+        embeddings = jnp.concatenate((cls_tokens, embeddings), axis=1)
+        embeddings = embeddings + self.position_embeddings
+        embeddings = self.dropout(embeddings, deterministic=deterministic)
+        return embeddings
+
+
+class FlaxViTSelfAttention(nn.Module):
+    config: ViTConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        if self.config.hidden_size % self.config.num_attention_heads != 0:
+            # note: these must be f-strings, otherwise the config values are not interpolated
+            raise ValueError(
+                f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads`:"
+                f" {self.config.num_attention_heads}"
+            )
+
+        self.query = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal"
+            ),
+            use_bias=self.config.qkv_bias,
+        )
+        self.key = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal"
+            ),
+            use_bias=self.config.qkv_bias,
+        )
+        self.value = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                self.config.initializer_range**2, mode="fan_in", distribution="truncated_normal"
+            ),
+            use_bias=self.config.qkv_bias,
+        )
+
+    def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False):
+        head_dim = self.config.hidden_size // self.config.num_attention_heads
+
+        query_states = self.query(hidden_states).reshape(
+            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
+        )
+        value_states = self.value(hidden_states).reshape(
+            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
+        )
+        key_states = self.key(hidden_states).reshape(
+            hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim)
+        )
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_probs_dropout_prob,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        attn_output = 
jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxViTSelfOutput(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxViTAttention(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.attention = FlaxViTSelfAttention(self.config, dtype=self.dtype) + self.output = FlaxViTSelfOutput(self.config, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic=True, output_attentions: bool = False): + attn_outputs = self.attention(hidden_states, deterministic=deterministic, output_attentions=output_attentions) + attn_output = attn_outputs[0] + hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +class FlaxViTIntermediate(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class FlaxViTOutput(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = hidden_states + attention_output + return hidden_states + + +class FlaxViTLayer(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxViTAttention(self.config, dtype=self.dtype) + self.intermediate = FlaxViTIntermediate(self.config, dtype=self.dtype) + self.output = FlaxViTOutput(self.config, dtype=self.dtype) + self.layernorm_before = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.layernorm_after = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False): + attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention + 
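+            # together with the second layernorm below, this makes each ViT block pre-LN:
+            # LN -> self-attention -> residual add, then LN -> MLP -> residual add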
deterministic=deterministic, + output_attentions=output_attentions, + ) + + attention_output = attention_outputs[0] + + # first residual connection + attention_output = attention_output + hidden_states + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(attention_output) + + hidden_states = self.intermediate(layer_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + return outputs + + +class FlaxViTLayerCollection(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxViTLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer(hidden_states, deterministic=deterministic, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states,) + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxViTEncoder(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layer = FlaxViTLayerCollection(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxViTPooler(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.pooler_output_size, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.pooler_act] + + def __call__(self, hidden_states): + cls_hidden_state = hidden_states[:, 0] + cls_hidden_state = self.dense(cls_hidden_state) + return self.activation(cls_hidden_state) + + +class FlaxViTPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = ViTConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + module_class: nn.Module = None + + def __init__( + self, + config: ViTConfig, + input_shape=None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + if input_shape is None: + input_shape = (1, config.image_size, config.image_size, config.num_channels) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + pixel_values = jnp.zeros(input_shape, dtype=self.dtype) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, pixel_values, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + pixel_values, + params: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(pixel_values, dtype=jnp.float32), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxViTModule(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + + def setup(self): + self.embeddings = FlaxViTEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxViTEncoder(self.config, dtype=self.dtype) + self.layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.pooler = FlaxViTPooler(self.config, dtype=self.dtype) if self.add_pooling_layer else None + + def __call__( + self, + pixel_values, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + hidden_states = self.embeddings(pixel_values, deterministic=deterministic) + + outputs = self.encoder( + hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + hidden_states = self.layernorm(hidden_states) + pooled = self.pooler(hidden_states) if self.add_pooling_layer else None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return 
(hidden_states, pooled) + outputs[1:] + + return FlaxBaseModelOutputWithPooling( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + "The bare ViT Model transformer outputting raw hidden-states without any specific head on top.", + VIT_START_DOCSTRING, +) +class FlaxViTModel(FlaxViTPreTrainedModel): + module_class = FlaxViTModule + + +FLAX_VISION_MODEL_DOCSTRING = """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, FlaxViTModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + >>> model = FlaxViTModel.from_pretrained("google/vit-base-patch16-224-in21k") + + >>> inputs = image_processor(images=image, return_tensors="np") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ``` +""" + +overwrite_call_docstring(FlaxViTModel, FLAX_VISION_MODEL_DOCSTRING) +append_replace_return_docstrings(FlaxViTModel, output_type=FlaxBaseModelOutputWithPooling, config_class=ViTConfig) + + +class FlaxViTForImageClassificationModule(nn.Module): + config: ViTConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.vit = FlaxViTModule(config=self.config, dtype=self.dtype, add_pooling_layer=False) + self.classifier = nn.Dense( + self.config.num_labels, + dtype=self.dtype, + kernel_init=jax.nn.initializers.variance_scaling( + self.config.initializer_range**2, "fan_in", "truncated_normal" + ), + ) + + def __call__( + self, + pixel_values=None, + deterministic: bool = True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.classifier(hidden_states[:, 0, :]) + + if not return_dict: + output = (logits,) + outputs[2:] + return output + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. 
+ """, + VIT_START_DOCSTRING, +) +class FlaxViTForImageClassification(FlaxViTPreTrainedModel): + module_class = FlaxViTForImageClassificationModule + + +FLAX_VISION_CLASSIF_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoImageProcessor, FlaxViTForImageClassification + >>> from PIL import Image + >>> import jax + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") + >>> model = FlaxViTForImageClassification.from_pretrained("google/vit-base-patch16-224") + + >>> inputs = image_processor(images=image, return_tensors="np") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1) + >>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()]) + ``` +""" + +overwrite_call_docstring(FlaxViTForImageClassification, FLAX_VISION_CLASSIF_DOCSTRING) +append_replace_return_docstrings( + FlaxViTForImageClassification, output_type=FlaxSequenceClassifierOutput, config_class=ViTConfig +) + + +__all__ = ["FlaxViTForImageClassification", "FlaxViTModel", "FlaxViTPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/vit/modeling_tf_vit.py b/docs/transformers/build/lib/transformers/models/vit/modeling_tf_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..e18b38e597f37298a75570a90f7c1019bf7c7788 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/modeling_tf_vit.py @@ -0,0 +1,907 @@ +# coding=utf-8 +# Copyright 2021 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""TF 2.0 ViT model.""" + +from __future__ import annotations + +import collections.abc +import math +from typing import Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling, TFSequenceClassifierOutput +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list, stable_softmax +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_vit import ViTConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ViTConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +class TFViTEmbeddings(keras.layers.Layer): + """ + Construct the CLS token, position and patch embeddings. + + """ + + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.patch_embeddings = TFViTPatchEmbeddings(config, name="patch_embeddings") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def build(self, input_shape=None): + num_patches = self.patch_embeddings.num_patches + self.cls_token = self.add_weight( + shape=(1, 1, self.config.hidden_size), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="cls_token", + ) + self.position_embeddings = self.add_weight( + shape=(1, num_patches + 1, self.config.hidden_size), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="position_embeddings", + ) + + if self.built: + return + self.built = True + if getattr(self, "patch_embeddings", None) is not None: + with tf.name_scope(self.patch_embeddings.name): + self.patch_embeddings.build(None) + + def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. 
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + + batch_size, seq_len, dim = shape_list(embeddings) + num_patches = seq_len - 1 + + _, num_positions, _ = shape_list(self.position_embeddings) + num_positions -= 1 + + if num_patches == num_positions and height == width: + return self.position_embeddings + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + patch_pos_embed = tf.image.resize( + images=tf.reshape( + patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + ), + size=(h0, w0), + method="bicubic", + ) + + shape = shape_list(patch_pos_embed) + assert h0 == shape[-3] and w0 == shape[-2] + patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim)) + return tf.concat(values=(class_pos_embed, patch_pos_embed), axis=1) + + def call( + self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False + ) -> tf.Tensor: + batch_size, num_channels, height, width = shape_list(pixel_values) + embeddings = self.patch_embeddings( + pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, training=training + ) + + # add the [CLS] token to the embedded patch tokens + cls_tokens = tf.repeat(self.cls_token, repeats=batch_size, axis=0) + embeddings = tf.concat((cls_tokens, embeddings), axis=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings, training=training) + + return embeddings + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +class TFViTPatchEmbeddings(keras.layers.Layer): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
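+
+    For example, with the default `image_size=224` and `patch_size=16`, the strided convolution below produces
+    (224 / 16) * (224 / 16) = 196 patch embeddings per image.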
+ """ + + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + self.num_channels = num_channels + self.config = config + + self.projection = keras.layers.Conv2D( + filters=hidden_size, + kernel_size=patch_size, + strides=patch_size, + padding="valid", + data_format="channels_last", + use_bias=True, + kernel_initializer=get_initializer(self.config.initializer_range), + bias_initializer="zeros", + name="projection", + ) + + def call( + self, pixel_values: tf.Tensor, interpolate_pos_encoding: bool = False, training: bool = False + ) -> tf.Tensor: + batch_size, num_channels, height, width = shape_list(pixel_values) + if tf.executing_eagerly() and num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + if not interpolate_pos_encoding: + if tf.executing_eagerly(): + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + + # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. + # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + projection = self.projection(pixel_values) + + # Change the 2D spatial dimensions to a single temporal dimension. 
+ # shape = (batch_size, num_patches, out_channels=embed_dim) + num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0]) + embeddings = tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1)) + + return embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, None, self.num_channels]) + + +class TFViTSelfAttention(keras.layers.Layer): + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + mixed_key_layer = self.key(inputs=hidden_states) + mixed_value_layer = self.value(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
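+        # at this point attention_probs holds softmax(QK^T / sqrt(attention_head_size)) over the key axis, so
+        # dropping an entry removes one key/value token from the weighted sum computed further below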
+ attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +class TFViTSelfOutput(keras.layers.Layer): + """ + The residual connection is defined in TFViTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFViTAttention(keras.layers.Layer): + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFViTSelfAttention(config, name="attention") + self.dense_output = TFViTSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, head_mask=head_mask, output_attentions=output_attentions, training=training + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +class TFViTIntermediate(keras.layers.Layer): + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, 
kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFViTOutput(keras.layers.Layer): + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = hidden_states + input_tensor + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + + +class TFViTLayer(keras.layers.Layer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFViTAttention(config, name="attention") + self.intermediate = TFViTIntermediate(config, name="intermediate") + self.vit_output = TFViTOutput(config, name="output") + + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + attention_outputs = self.attention( + # in ViT, layernorm is applied before self-attention + input_tensor=self.layernorm_before(inputs=hidden_states), + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = attention_outputs[0] + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(inputs=hidden_states) + + intermediate_output = self.intermediate(hidden_states=layer_output) + + # second residual connection is done here + layer_output = self.vit_output( + hidden_states=intermediate_output, input_tensor=hidden_states, training=training + ) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if 
getattr(self, "vit_output", None) is not None: + with tf.name_scope(self.vit_output.name): + self.vit_output.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.config.hidden_size]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.config.hidden_size]) + + +class TFViTEncoder(keras.layers.Layer): + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.layer = [TFViTLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=hidden_states, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFViTMainLayer(keras.layers.Layer): + config_class = ViTConfig + + def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, **kwargs): + super().__init__(**kwargs) + + self.config = config + + self.embeddings = TFViTEmbeddings(config, name="embeddings") + self.encoder = TFViTEncoder(config, name="encoder") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + self.pooler = TFViTPooler(config, name="pooler") if add_pooling_layer else None + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+            class PreTrainedModel
+        """
+        raise NotImplementedError
+
+    @unpack_inputs
+    def call(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            training=training,
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(inputs=sequence_output)
+        pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "layernorm", None) is not None:
+            with tf.name_scope(self.layernorm.name):
+                self.layernorm.build([None, None, self.config.hidden_size])
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)
+
+
+class TFViTPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = ViTConfig
+    base_model_prefix = "vit"
+    main_input_name = "pixel_values"
+
+
+VIT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+      `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+      `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    Args:
+        config ([`ViTConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VIT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, with each example having the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
+            for details.
+
+        head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        interpolate_pos_encoding (`bool`, *optional*):
+            Whether to interpolate the pre-trained position encodings.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+""" + + +@add_start_docstrings( + "The bare ViT Model transformer outputting raw hidden-states without any specific head on top.", + VIT_START_DOCSTRING, +) +class TFViTModel(TFViTPreTrainedModel): + def __init__(self, config: ViTConfig, *inputs, add_pooling_layer=True, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.vit = TFViTMainLayer(config, add_pooling_layer=add_pooling_layer, name="vit") + + @unpack_inputs + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def call( + self, + pixel_values: TFModelInputType | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + outputs = self.vit( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + with tf.name_scope(self.vit.name): + self.vit.build(None) + + +class TFViTPooler(keras.layers.Layer): + def __init__(self, config: ViTConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.pooler_output_size, + kernel_initializer=get_initializer(config.initializer_range), + activation=config.pooler_act, + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. 
+ + + """, + VIT_START_DOCSTRING, +) +class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: ViTConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.vit = TFViTMainLayer(config, add_pooling_layer=False, name="vit") + + # Classifier head + self.classifier = keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: TFModelInputType | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + outputs = self.vit( + pixel_values=pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.classifier(inputs=sequence_output[:, 0, :]) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + with tf.name_scope(self.vit.name): + self.vit.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +__all__ = ["TFViTForImageClassification", "TFViTModel", "TFViTPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/vit/modeling_vit.py b/docs/transformers/build/lib/transformers/models/vit/modeling_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..d757aeaf28b1ac0a32f042d98ef4bf929fe7f910 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit/modeling_vit.py @@ -0,0 +1,883 @@ +# coding=utf-8 +# Copyright 2021 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ViT model.""" + +import collections.abc +import math +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ImageClassifierOutput, + MaskedImageModelingOutput, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from .configuration_vit import ViTConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ViTConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +class ViTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: ViTConfig, use_mask_token: bool = False) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = ViTPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +class ViTPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
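+
+    The projection is a `Conv2d` whose kernel size and stride both equal `patch_size`, which is equivalent to
+    slicing the image into non-overlapping patches and applying one shared linear projection to each flattened
+    patch.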
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+ batch_size, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
+ f" Expected {self.num_channels} but got {num_channels}."
+ )
+ if not interpolate_pos_encoding:
+ if height != self.image_size[0] or width != self.image_size[1]:
+ raise ValueError(
+ f"Input image size ({height}*{width}) doesn't match model"
+ f" ({self.image_size[0]}*{self.image_size[1]})."
+ )
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+ return embeddings
+
+
+def eager_attention_forward(
+ module: nn.Module,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ value: torch.Tensor,
+ attention_mask: Optional[torch.Tensor],
+ scaling: float,
+ dropout: float = 0.0,
+ **kwargs,
+):
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+ # Normalize the attention scores to probabilities.
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+ # Mask heads if we want to
+ if attention_mask is not None:
+ attn_weights = attn_weights * attention_mask
+
+ attn_output = torch.matmul(attn_weights, value)
+ attn_output = attn_output.transpose(1, 2).contiguous()
+
+ return attn_output, attn_weights
+
+
+class ViTSelfAttention(nn.Module):
+ def __init__(self, config: ViTConfig) -> None:
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+ f"heads {config.num_attention_heads}."
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class ViTSelfOutput(nn.Module): + """ + The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
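+ The `input_tensor` argument is accepted only to keep the same interface as other models; it is not used
+ here, and the residual itself is added in `ViTLayer.forward`.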
+ """ + + def __init__(self, config: ViTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class ViTAttention(nn.Module): + def __init__(self, config: ViTConfig) -> None: + super().__init__() + self.attention = ViTSelfAttention(config) + self.output = ViTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class ViTIntermediate(nn.Module): + def __init__(self, config: ViTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class ViTOutput(nn.Module): + def __init__(self, config: ViTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +class ViTLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ViTAttention(config) + self.intermediate = ViTIntermediate(config) + self.output = ViTOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + 
head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class ViTEncoder(nn.Module): + def __init__(self, config: ViTConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ViTPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
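+
+ For illustration, any derived model can be loaded with an explicit attention backend (a minimal sketch; the
+ checkpoint is the one used throughout this file, and SDPA is typically the default when it is available):
+
+ ```python
+ >>> from transformers import ViTModel
+
+ >>> model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k", attn_implementation="eager")
+ ```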
+ """ + + config_class = ViTConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["ViTEmbeddings", "ViTLayer"] + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, ViTEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + module.cls_token.data = nn.init.trunc_normal_( + module.cls_token.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.cls_token.dtype) + + if module.mask_token is not None: + module.mask_token.data.zero_() + + +VIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare ViT Model transformer outputting raw hidden-states without any specific head on top.", + VIT_START_DOCSTRING, +) +class ViTModel(ViTPreTrainedModel): + def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False): + super().__init__(config) + self.config = config + + self.embeddings = ViTEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = ViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = ViTPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> ViTPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) 
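+ # `ImageProcessor` classes typically return float32 tensors; cast to the dtype of the patch projection
+ # weights so that half-precision (fp16/bf16) models do not fail on a dtype mismatch.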
+ expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ViTPooler(nn.Module): + def __init__(self, config: ViTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.pooler_output_size) + self.activation = ACT2FN[config.pooler_act] + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@add_start_docstrings( + """ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886). + + + + Note that we provide a script to pre-train this model on custom data in our [examples + directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining). + + + """, + VIT_START_DOCSTRING, +) +class ViTForMaskedImageModeling(ViTPreTrainedModel): + def __init__(self, config: ViTConfig) -> None: + super().__init__(config) + + self.vit = ViTModel(config, add_pooling_layer=False, use_mask_token=True) + + self.decoder = nn.Sequential( + nn.Conv2d( + in_channels=config.hidden_size, + out_channels=config.encoder_stride**2 * config.num_channels, + kernel_size=1, + ), + nn.PixelShuffle(config.encoder_stride), + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedImageModelingOutput]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). 
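+
+ When `bool_masked_pos` is provided, `config.patch_size` must equal `config.encoder_stride` so that the
+ decoder's `PixelShuffle` reconstructs an image at the input resolution (this is enforced below).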
+ + Returns: + + Examples: + ```python + >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k") + + >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2 + >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + >>> # create random boolean mask of shape (batch_size, num_patches) + >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool() + + >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) + >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction + >>> list(reconstructed_pixel_values.shape) + [1, 3, 224, 224] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride): + raise ValueError( + "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that " + "the reconstructed image has the same dimensions as the input. " + f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}." + ) + + outputs = self.vit( + pixel_values, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # Reshape to (batch_size, num_channels, height, width) + sequence_output = sequence_output[:, 1:] + batch_size, sequence_length, num_channels = sequence_output.shape + height = width = math.floor(sequence_length**0.5) + sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width) + + # Reconstruct pixel values + reconstructed_pixel_values = self.decoder(sequence_output) + + masked_im_loss = None + if bool_masked_pos is not None: + size = self.config.image_size // self.config.patch_size + bool_masked_pos = bool_masked_pos.reshape(-1, size, size) + mask = ( + bool_masked_pos.repeat_interleave(self.config.patch_size, 1) + .repeat_interleave(self.config.patch_size, 2) + .unsqueeze(1) + .contiguous() + ) + reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none") + masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels + + if not return_dict: + output = (reconstructed_pixel_values,) + outputs[1:] + return ((masked_im_loss,) + output) if masked_im_loss is not None else output + + return MaskedImageModelingOutput( + loss=masked_im_loss, + reconstruction=reconstructed_pixel_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet. + + + + Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. 
This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """, + VIT_START_DOCSTRING, +) +class ViTForImageClassification(ViTPreTrainedModel): + def __init__(self, config: ViTConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.vit = ViTModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output[:, 0, :]) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["ViTForImageClassification", "ViTForMaskedImageModeling", "ViTModel", "ViTPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/vit_mae/__init__.py b/docs/transformers/build/lib/transformers/models/vit_mae/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..253017c39d6a2bb0e835d83c23526c8d7b08665a --- /dev/null 
+++ b/docs/transformers/build/lib/transformers/models/vit_mae/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vit_mae import * + from .modeling_tf_vit_mae import * + from .modeling_vit_mae import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/vit_mae/configuration_vit_mae.py b/docs/transformers/build/lib/transformers/models/vit_mae/configuration_vit_mae.py new file mode 100644 index 0000000000000000000000000000000000000000..2c5ec3600599e6ad3f2708880ce35a9ec9a626e1 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit_mae/configuration_vit_mae.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ViT MAE model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ViTMAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ViTMAEModel`]. It is used to instantiate an ViT + MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the ViT + [facebook/vit-mae-base](https://huggingface.co/facebook/vit-mae-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+ decoder_num_attention_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the decoder.
+ decoder_hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the decoder.
+ decoder_num_hidden_layers (`int`, *optional*, defaults to 8):
+ Number of hidden layers in the decoder.
+ decoder_intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
+ mask_ratio (`float`, *optional*, defaults to 0.75):
+ The ratio of the number of masked tokens in the input sequence.
+ norm_pix_loss (`bool`, *optional*, defaults to `False`):
+ Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved
+ representation quality in the authors' experiments.
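+
+ With the default `mask_ratio=0.75` on 224x224 inputs split into 16x16 patches (196 patch tokens), the
+ encoder only sees `int(196 * (1 - 0.75)) = 49` visible patch tokens per image (plus the CLS token).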
+ + Example: + + ```python + >>> from transformers import ViTMAEConfig, ViTMAEModel + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = ViTMAEConfig() + + >>> # Initializing a model (with random weights) from the vit-mae-base style configuration + >>> model = ViTMAEModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vit_mae" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + decoder_num_attention_heads=16, + decoder_hidden_size=512, + decoder_num_hidden_layers=8, + decoder_intermediate_size=2048, + mask_ratio=0.75, + norm_pix_loss=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_hidden_size = decoder_hidden_size + self.decoder_num_hidden_layers = decoder_num_hidden_layers + self.decoder_intermediate_size = decoder_intermediate_size + self.mask_ratio = mask_ratio + self.norm_pix_loss = norm_pix_loss + + +__all__ = ["ViTMAEConfig"] diff --git a/docs/transformers/build/lib/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py b/docs/transformers/build/lib/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..47e77593f6fd3ad7c2b7ff2c329b84f432060c7d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit_mae/convert_vit_mae_to_pytorch.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
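+
+# Example invocation (illustrative; the checkpoint URL is the script's default and the output path is arbitrary):
+#
+#   python convert_vit_mae_to_pytorch.py \
+#       --checkpoint_url https://dl.fbaipublicfiles.com/mae/visualize/mae_visualize_vit_base.pth \
+#       --pytorch_dump_folder_path ./vit-mae-base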
+"""Convert ViT MAE checkpoints from the original repository: https://github.com/facebookresearch/mae""" + +import argparse + +import requests +import torch +from PIL import Image + +from transformers import ViTMAEConfig, ViTMAEForPreTraining, ViTMAEImageProcessor + + +def rename_key(name): + if "cls_token" in name: + name = name.replace("cls_token", "vit.embeddings.cls_token") + if "mask_token" in name: + name = name.replace("mask_token", "decoder.mask_token") + if "decoder_pos_embed" in name: + name = name.replace("decoder_pos_embed", "decoder.decoder_pos_embed") + if "pos_embed" in name and "decoder" not in name: + name = name.replace("pos_embed", "vit.embeddings.position_embeddings") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "vit.embeddings.patch_embeddings.projection") + if "patch_embed.norm" in name: + name = name.replace("patch_embed.norm", "vit.embeddings.norm") + if "decoder_blocks" in name: + name = name.replace("decoder_blocks", "decoder.decoder_layers") + if "blocks" in name: + name = name.replace("blocks", "vit.encoder.layer") + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + if "decoder_embed" in name: + name = name.replace("decoder_embed", "decoder.decoder_embed") + if "decoder_norm" in name: + name = name.replace("decoder_norm", "decoder.decoder_norm") + if "decoder_pred" in name: + name = name.replace("decoder_pred", "decoder.decoder_pred") + if "norm.weight" in name and "decoder" not in name: + name = name.replace("norm.weight", "vit.layernorm.weight") + if "norm.bias" in name and "decoder" not in name: + name = name.replace("norm.bias", "vit.layernorm.bias") + + return name + + +def convert_state_dict(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + key_split = key.split(".") + layer_num = int(key_split[1]) + if "decoder_blocks" in key: + dim = config.decoder_hidden_size + prefix = "decoder.decoder_layers." + if "weight" in key: + orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + elif "bias" in key: + orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] + else: + dim = config.hidden_size + prefix = "vit.encoder.layer." 
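+ # the encoder branch splits the fused qkv tensor exactly like the decoder branch above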
+ if "weight" in key:
+ orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :]
+ orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :]
+ orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :]
+ elif "bias" in key:
+ orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim]
+ orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2]
+ orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:]
+
+ else:
+ orig_state_dict[rename_key(key)] = val
+
+ return orig_state_dict
+
+
+def convert_vit_mae_checkpoint(checkpoint_url, pytorch_dump_folder_path):
+ config = ViTMAEConfig()
+ if "large" in checkpoint_url:
+ config.hidden_size = 1024
+ config.intermediate_size = 4096
+ config.num_hidden_layers = 24
+ config.num_attention_heads = 16
+ elif "huge" in checkpoint_url:
+ config.patch_size = 14
+ config.hidden_size = 1280
+ config.intermediate_size = 5120
+ config.num_hidden_layers = 32
+ config.num_attention_heads = 16
+
+ model = ViTMAEForPreTraining(config)
+
+ state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
+
+ new_state_dict = convert_state_dict(state_dict, config)
+
+ model.load_state_dict(new_state_dict)
+ model.eval()
+
+ url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg"
+
+ image = Image.open(requests.get(url, stream=True).raw)
+ image_processor = ViTMAEImageProcessor(size=config.image_size)
+ inputs = image_processor(images=image, return_tensors="pt")
+
+ # forward pass
+ torch.manual_seed(2)
+ outputs = model(**inputs)
+ logits = outputs.logits
+
+ if "large" in checkpoint_url:
+ expected_slice = torch.tensor(
+ [[-0.7309, -0.7128, -1.0169], [-1.0161, -0.9058, -1.1878], [-1.0478, -0.9411, -1.1911]]
+ )
+ elif "huge" in checkpoint_url:
+ expected_slice = torch.tensor(
+ [[-1.1599, -0.9199, -1.2221], [-1.1952, -0.9269, -1.2307], [-1.2143, -0.9337, -1.2262]]
+ )
+ else:
+ expected_slice = torch.tensor(
+ [[-0.9192, -0.8481, -1.1259], [-1.1349, -1.0034, -1.2599], [-1.1757, -1.0429, -1.2726]]
+ )
+
+ # verify logits
+ assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4)
+
+ print(f"Saving model to {pytorch_dump_folder_path}")
+ model.save_pretrained(pytorch_dump_folder_path)
+
+ print(f"Saving image processor to {pytorch_dump_folder_path}")
+ image_processor.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # Required parameters
+ parser.add_argument(
+ "--checkpoint_url",
+ default="https://dl.fbaipublicfiles.com/mae/visualize/mae_visualize_vit_base.pth",
+ type=str,
+ help="URL of the checkpoint you'd like to convert.",
+ )
+ parser.add_argument(
+ "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+ ) + + args = parser.parse_args() + convert_vit_mae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/docs/transformers/build/lib/transformers/models/vit_mae/modeling_tf_vit_mae.py b/docs/transformers/build/lib/transformers/models/vit_mae/modeling_tf_vit_mae.py new file mode 100644 index 0000000000000000000000000000000000000000..8879a8665f3ed6cf1009d4b7c2366807e7c51664 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit_mae/modeling_tf_vit_mae.py @@ -0,0 +1,1375 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 ViT MAE (masked autoencoder) model.""" + +from __future__ import annotations + +import collections.abc +import math +from copy import deepcopy +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import TFBaseModelOutput +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list, stable_softmax +from ...utils import logging +from .configuration_vit_mae import ViTMAEConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ViTMAEConfig" +_CHECKPOINT_FOR_DOC = "facebook/vit-mae-base" + + +@dataclass +class TFViTMAEModelOutput(ModelOutput): + """ + Class for TFViTMAEModel's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + mask (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (1) and which are not (0). + ids_restore (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Tensor containing the original index of the (shuffled) masked patches. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. 
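+
+ The `ids_restore` indices are consumed by the decoder to unshuffle the mask tokens back into their original
+ patch positions before reconstruction.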
+ """ + + last_hidden_state: Optional[tf.Tensor] = None + mask: Optional[tf.Tensor] = None + ids_restore: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFViTMAEDecoderOutput(ModelOutput): + """ + Class for TFViTMAEDecoder's outputs, with potential hidden states and attentions. + + Args: + logits (`tf.Tensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFViTMAEForPreTrainingOutput(ModelOutput): + """ + Class for TFViTMAEForPreTraining's outputs, with potential hidden states and attentions. + + Args: + loss (`tf.Tensor` of shape `(1,)`): + Pixel reconstruction loss. + logits (`tf.Tensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + mask (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (1) and which are not (0). + ids_restore (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Tensor containing the original index of the (shuffled) masked patches. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + mask: Optional[tf.Tensor] = None + ids_restore: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False): + """ + Create 2D sin/cos positional embeddings. + + Args: + embed_dim (`int`): + Embedding dimension. + grid_size (`int`): + The grid height and width. + add_cls_token (`bool`, *optional*, defaults to `False`): + Whether or not to add a classification (CLS) token. 
+
+ Returns:
+ (`tf.Tensor` of shape `(grid_size*grid_size, embed_dim)` or `(1+grid_size*grid_size, embed_dim)`): the
+ position embeddings (with or without classification token)
+ """
+ grid_h = tf.range(grid_size, dtype=tf.float32)
+ grid_w = tf.range(grid_size, dtype=tf.float32)
+ grid = tf.meshgrid(grid_w, grid_h) # here w goes first
+ grid = tf.stack(grid, axis=0)
+
+ grid = tf.reshape(grid, [2, 1, grid_size, grid_size])
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+ if add_cls_token:
+ pos_embed = tf.concat([tf.zeros((1, embed_dim)), pos_embed], axis=0)
+ return pos_embed
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be even")
+
+ # use half of dimensions to encode grid_h
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
+
+ emb = tf.concat([emb_h, emb_w], axis=1) # (H*W, D)
+ return emb
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+ """
+ Args:
+ embed_dim: output dimension for each position.
+ pos: a list of positions to be encoded, of size (M,).
+
+ Returns:
+ Position embeddings of shape (M, D).
+ """
+ if embed_dim % 2 != 0:
+ raise ValueError("embed_dim must be even")
+
+ omega = tf.range(embed_dim // 2, dtype="float32")
+ omega /= embed_dim / 2.0
+ omega = 1.0 / 10000**omega # (D/2,)
+
+ pos = tf.reshape(pos, [-1]) # (M,)
+ out = tf.einsum("m,d->md", pos, omega) # (M, D/2), outer product
+
+ # half of the dimensions get the sine pattern and the other half get the cosine pattern;
+ # the two halves are then concatenated
+ emb_sin = tf.sin(out) # (M, D/2)
+ emb_cos = tf.cos(out) # (M, D/2)
+
+ emb = tf.concat([emb_sin, emb_cos], axis=1) # (M, D)
+ return emb
+
+
+class TFViTMAEEmbeddings(keras.layers.Layer):
+ """
+ Construct the CLS token, position and patch embeddings.
+ """
+
+ def __init__(self, config: ViTMAEConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.patch_embeddings = TFViTMAEPatchEmbeddings(config, name="patch_embeddings")
+ self.num_patches = self.patch_embeddings.num_patches
+
+ self.config = config
+
+ def build(self, input_shape=None):
+ self.cls_token = self.add_weight(
+ shape=(1, 1, self.config.hidden_size),
+ initializer=tf.random_normal_initializer(stddev=self.config.initializer_range),
+ trainable=True,
+ name="cls_token",
+ )
+ self.position_embeddings = self.add_weight(
+ shape=(1, self.num_patches + 1, self.config.hidden_size),
+ initializer="zeros",
+ trainable=False, # fixed sin-cos embedding
+ name="position_embeddings",
+ )
+ pos_embed = get_2d_sincos_pos_embed(
+ self.position_embeddings.shape[-1],
+ int(self.patch_embeddings.num_patches**0.5),
+ add_cls_token=True,
+ )[None, ...]
+ self.position_embeddings.assign(pos_embed)
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "patch_embeddings", None) is not None:
+ with tf.name_scope(self.patch_embeddings.name):
+ self.patch_embeddings.build(None)
+
+ def interpolate_pos_encoding(self, embeddings, height, width) -> tf.Tensor:
+ """
+ This method interpolates the pre-trained position encodings so that the model can be used on
+ higher-resolution images.
+ + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + + batch_size, seq_len, dim = shape_list(embeddings) + num_patches = seq_len - 1 + + _, num_positions, _ = shape_list(self.position_embeddings) + num_positions -= 1 + + if num_patches == num_positions and height == width: + return self.position_embeddings + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + patch_pos_embed = tf.image.resize( + images=tf.reshape( + patch_pos_embed, shape=(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + ), + size=(h0, w0), + method="bicubic", + ) + + patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim)) + return tf.concat(values=(class_pos_embed, patch_pos_embed), axis=1) + + def random_masking(self, sequence: tf.Tensor, noise: tf.Tensor | None = None): + """ + Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random + noise. + + Args: + sequence (`tf.Tensor` of shape `(batch_size, sequence_length, dim)`) + noise (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*) which is + mainly used for testing purposes to control randomness and maintain the reproducibility + """ + batch_size, seq_length, dim = shape_list(sequence) + len_keep = int(seq_length * (1 - self.config.mask_ratio)) + + if noise is None: + noise = tf.random.uniform(shape=(batch_size, seq_length), minval=0.0, maxval=1.0) # noise in [0, 1) + + # sort noise for each sample + ids_shuffle = tf.argsort(noise, axis=1) # ascend: small is keep, large is remove + ids_restore = tf.argsort(ids_shuffle, axis=1) + + # keep the first subset + ids_keep = ids_shuffle[:, :len_keep] + sequence_unmasked = tf.gather( + sequence, + axis=1, + batch_dims=1, + indices=ids_keep, + ) + + # generate the binary mask: 0 is keep, 1 is remove + # this hack is needed because TF's EagerTensors don't support + # assignment + mask_keep = tf.zeros((batch_size, len_keep)) + mask_remove = tf.ones((batch_size, seq_length - len_keep)) + mask = tf.concat([mask_keep, mask_remove], axis=-1) + + # unshuffle to get the binary mask + mask = tf.gather(mask, axis=1, batch_dims=1, indices=ids_restore) + + return sequence_unmasked, mask, ids_restore + + def call( + self, pixel_values: tf.Tensor, noise: Optional[tf.Tensor] = None, interpolate_pos_encoding: bool = False + ) -> tf.Tensor: + batch_size, num_channels, height, width = shape_list(pixel_values) + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + if interpolate_pos_encoding: + position_embeddings = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embeddings = self.position_embeddings + # add position embeddings w/o cls token + embeddings = embeddings + position_embeddings[:, 1:, :] + + # masking: length -> length * config.mask_ratio + embeddings, mask, ids_restore = self.random_masking(embeddings, noise) + + # append cls token + cls_token = self.cls_token + position_embeddings[:, :1, :] + cls_tokens = tf.tile(cls_token, (shape_list(embeddings)[0], 1, 1)) + embeddings = tf.concat([cls_tokens, embeddings], axis=1) + + return embeddings, mask, ids_restore + + +class TFViTMAEPatchEmbeddings(keras.layers.Layer): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch 
embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config: ViTMAEConfig, **kwargs):
+ super().__init__(**kwargs)
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_patches = num_patches
+ self.num_channels = num_channels
+ self.config = config
+
+ self.projection = keras.layers.Conv2D(
+ filters=hidden_size,
+ kernel_size=patch_size,
+ strides=patch_size,
+ padding="valid",
+ data_format="channels_last",
+ kernel_initializer="glorot_uniform", # following torch.nn.Linear
+ bias_initializer="zeros",
+ name="projection",
+ )
+
+ def call(
+ self, pixel_values: tf.Tensor, training: bool = False, interpolate_pos_encoding: bool = False
+ ) -> tf.Tensor:
+ batch_size, num_channels, height, width = shape_list(pixel_values)
+ if tf.executing_eagerly():
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values matches the one set in the"
+ " configuration."
+ )
+ if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]):
+ raise ValueError(
+ f"Input image size ({height}*{width}) doesn't match model"
+ f" ({self.image_size[0]}*{self.image_size[1]})."
+ )
+
+ # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
+ # So change the input format from `NCHW` to `NHWC`.
+ # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+ pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+ projection = self.projection(pixel_values)
+
+ # Change the 2D spatial dimensions to a single temporal dimension.
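+ # e.g. for a 224x224 input with 16x16 patches: (batch_size, 14, 14, hidden_size) -> (batch_size, 196, hidden_size)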
+ # shape = (batch_size, num_patches, out_channels=embed_dim) + num_patches = (width // self.patch_size[1]) * (height // self.patch_size[0]) + x = tf.reshape(tensor=projection, shape=(batch_size, num_patches, -1)) + + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, None, self.num_channels]) + + +# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfAttention with ViT->ViTMAE +class TFViTMAESelfAttention(keras.layers.Layer): + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + mixed_key_layer = self.key(inputs=hidden_states) + mixed_value_layer = self.value(inputs=hidden_states) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
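For readers following `transpose_for_scores` and the score computation above, here is a standalone sketch of the same head-splitting and scaled dot-product steps. It is illustrative only: plain `tf.nn.softmax` stands in for the library's `stable_softmax`, and all sizes are made up (ViT-Base-like).

```python
import math
import tensorflow as tf

batch, seq_len, num_heads, head_size = 2, 50, 12, 64
hidden = num_heads * head_size

def split_heads(x):
    # (batch, seq, hidden) -> (batch, heads, seq, head_size)
    x = tf.reshape(x, (batch, -1, num_heads, head_size))
    return tf.transpose(x, perm=[0, 2, 1, 3])

q, k, v = (split_heads(tf.random.normal((batch, seq_len, hidden))) for _ in range(3))

scores = tf.matmul(q, k, transpose_b=True) / math.sqrt(head_size)  # (batch, heads, seq, seq)
probs = tf.nn.softmax(scores, axis=-1)
context = tf.matmul(probs, v)  # (batch, heads, seq, head_size)
# merge the heads back: (batch, seq, hidden)
context = tf.reshape(tf.transpose(context, perm=[0, 2, 1, 3]), (batch, seq_len, hidden))
print(context.shape)  # (2, 50, 768)
```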
+ attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.vit.modeling_tf_vit.TFViTSelfOutput with ViT->ViTMAE +class TFViTMAESelfOutput(keras.layers.Layer): + """ + The residual connection is defined in TFViTMAELayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.vit.modeling_tf_vit.TFViTAttention with ViT->ViTMAE +class TFViTMAEAttention(keras.layers.Layer): + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFViTMAESelfAttention(config, name="attention") + self.dense_output = TFViTMAESelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, head_mask=head_mask, output_attentions=output_attentions, training=training + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +# Copied from 
transformers.models.vit.modeling_tf_vit.TFViTIntermediate with ViT->ViTMAE +class TFViTMAEIntermediate(keras.layers.Layer): + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.vit.modeling_tf_vit.TFViTOutput with ViT->ViTMAE +class TFViTMAEOutput(keras.layers.Layer): + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = hidden_states + input_tensor + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + + +# Copied from transformers.models.vit.modeling_tf_vit.TFViTLayer with ViT->ViTMAE +class TFViTMAELayer(keras.layers.Layer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFViTMAEAttention(config, name="attention") + self.intermediate = TFViTMAEIntermediate(config, name="intermediate") + self.vit_output = TFViTMAEOutput(config, name="output") + + self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before") + self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + attention_outputs = self.attention( + # in ViTMAE, layernorm is applied before self-attention + input_tensor=self.layernorm_before(inputs=hidden_states), + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = attention_outputs[0] + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViTMAE, layernorm is also applied after self-attention + layer_output = self.layernorm_after(inputs=hidden_states) + + intermediate_output = self.intermediate(hidden_states=layer_output) + + # second residual connection is done here + layer_output = self.vit_output( + hidden_states=intermediate_output, input_tensor=hidden_states, 
training=training + ) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "vit_output", None) is not None: + with tf.name_scope(self.vit_output.name): + self.vit_output.build(None) + if getattr(self, "layernorm_before", None) is not None: + with tf.name_scope(self.layernorm_before.name): + self.layernorm_before.build([None, None, self.config.hidden_size]) + if getattr(self, "layernorm_after", None) is not None: + with tf.name_scope(self.layernorm_after.name): + self.layernorm_after.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.vit.modeling_tf_vit.TFViTEncoder with ViT->ViTMAE +class TFViTMAEEncoder(keras.layers.Layer): + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + self.layer = [TFViTMAELayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=hidden_states, + head_mask=head_mask[i], + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFViTMAEMainLayer(keras.layers.Layer): + config_class = ViTMAEConfig + + def __init__(self, config: ViTMAEConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + + self.embeddings = TFViTMAEEmbeddings(config, name="embeddings") + self.encoder = TFViTMAEEncoder(config, name="encoder") + self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm") + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + pixel_values: TFModelInputType | None = None, + noise: Optional[tf.Tensor] = None, + head_mask: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + interpolate_pos_encoding: bool = False, + ) -> Union[TFViTMAEModelOutput, Tuple[tf.Tensor]]: + embedding_output, mask, ids_restore = self.embeddings( + pixel_values=pixel_values, + training=training, + noise=noise, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(inputs=sequence_output) + + if not return_dict: + return (sequence_output, mask, ids_restore) + encoder_outputs[1:] + + return TFViTMAEModelOutput( + last_hidden_state=sequence_output, + mask=mask, + ids_restore=ids_restore, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "layernorm", None) is not None: + with tf.name_scope(self.layernorm.name): + self.layernorm.build([None, None, self.config.hidden_size]) + + +class TFViTMAEPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ViTMAEConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + + +VIT_MAE_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. 
Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+
+
+    Args:
+        config ([`ViTMAEConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VIT_MAE_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
+            for details.
+
+        head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used
+            in eager mode, in graph mode the value will always be set to `True`.
+
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+            Whether to interpolate the position encodings at the encoder and decoder.
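To make the two accepted input formats described above concrete, a short usage sketch (not part of the patch; the checkpoint name is the one used in this file's examples, and loading it requires network access):

```python
import tensorflow as tf
from transformers import TFViTMAEModel

model = TFViTMAEModel.from_pretrained("facebook/vit-mae-base")
pixel_values = tf.random.uniform((1, 3, 224, 224))

outputs = model(pixel_values=pixel_values)       # keyword argument
outputs = model({"pixel_values": pixel_values})  # dict in the first positional argument
print(outputs.last_hidden_state.shape)
```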
+""" + + +@add_start_docstrings( + "The bare ViTMAE Model transformer outputting raw hidden-states without any specific head on top.", + VIT_MAE_START_DOCSTRING, +) +class TFViTMAEModel(TFViTMAEPreTrainedModel): + def __init__(self, config: ViTMAEConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.vit = TFViTMAEMainLayer(config, name="vit") + + def get_input_embeddings(self): + return self.vit.get_input_embeddings() + + @unpack_inputs + @add_start_docstrings_to_model_forward(VIT_MAE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFViTMAEModelOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: TFModelInputType | None = None, + noise: Optional[tf.Tensor] = None, + head_mask: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + interpolate_pos_encoding: bool = False, + ) -> Union[TFViTMAEModelOutput, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, TFViTMAEModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base") + >>> model = TFViTMAEModel.from_pretrained("facebook/vit-mae-base") + + >>> inputs = image_processor(images=image, return_tensors="tf") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + outputs = self.vit( + pixel_values=pixel_values, + noise=noise, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + with tf.name_scope(self.vit.name): + self.vit.build(None) + + +class TFViTMAEDecoder(keras.layers.Layer): + def __init__(self, config, num_patches, **kwargs): + super().__init__(**kwargs) + self.decoder_embed = keras.layers.Dense(config.decoder_hidden_size, name="decoder_embed") + + decoder_config = deepcopy(config) + decoder_config.hidden_size = config.decoder_hidden_size + decoder_config.num_hidden_layers = config.decoder_num_hidden_layers + decoder_config.num_attention_heads = config.decoder_num_attention_heads + decoder_config.intermediate_size = config.decoder_intermediate_size + self.decoder_layers = [ + TFViTMAELayer(decoder_config, name=f"decoder_layers.{j}") for j in range(config.decoder_num_hidden_layers) + ] + + self.decoder_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="decoder_norm") + self.decoder_pred = keras.layers.Dense( + config.patch_size**2 * config.num_channels, + kernel_initializer=get_initializer(config.initializer_range), + name="decoder_pred", + ) # encoder to decoder + self.config = config + self.num_patches = num_patches + + def build(self, input_shape=None): + self.mask_token = self.add_weight( + shape=(1, 1, self.config.decoder_hidden_size), + initializer=tf.random_normal_initializer(stddev=self.config.initializer_range), + trainable=True, + name="mask_token", + ) + self.decoder_pos_embed = self.add_weight( + shape=(1, self.num_patches + 1, self.config.decoder_hidden_size), + initializer="zeros", + 
trainable=False,
+            name="decoder_pos_embed",
+        )
+        decoder_pos_embed = get_2d_sincos_pos_embed(
+            self.decoder_pos_embed.shape[-1],
+            int(self.num_patches**0.5),
+            add_cls_token=True,
+        )[None, ...]
+        self.decoder_pos_embed.assign(decoder_pos_embed)
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "decoder_embed", None) is not None:
+            with tf.name_scope(self.decoder_embed.name):
+                self.decoder_embed.build([None, None, self.config.hidden_size])
+        if getattr(self, "decoder_norm", None) is not None:
+            with tf.name_scope(self.decoder_norm.name):
+                self.decoder_norm.build([None, None, self.config.decoder_hidden_size])
+        if getattr(self, "decoder_pred", None) is not None:
+            with tf.name_scope(self.decoder_pred.name):
+                self.decoder_pred.build([None, None, self.config.decoder_hidden_size])
+        if getattr(self, "decoder_layers", None) is not None:
+            for layer in self.decoder_layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+    def interpolate_pos_encoding(self, embeddings) -> tf.Tensor:
+        """
+        This method is a modified version of the interpolation function for the ViT MAE model at the decoder. It
+        allows interpolating the pre-trained decoder position encodings so that the model can be used on higher
+        resolution images.
+
+        Source:
+        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
+        """
+
+        # [batch_size, num_patches + 1, hidden_size]
+        _, num_positions, dim = shape_list(self.decoder_pos_embed)
+
+        # -1 removes the class dimension since we later append it without interpolation
+        seq_len = shape_list(embeddings)[1] - 1
+        num_positions = num_positions - 1
+
+        # Separation of class token and patch tokens
+        class_pos_embed = self.decoder_pos_embed[:, :1, :]
+        patch_pos_embed = self.decoder_pos_embed[:, 1:, :]
+
+        # interpolate the position embeddings
+        patch_pos_embed = tf.image.resize(
+            images=tf.reshape(patch_pos_embed, shape=(1, 1, -1, dim)),
+            size=(1, seq_len),
+            method="bicubic",
+        )
+
+        # [1, seq_len, hidden_size]
+        patch_pos_embed = tf.reshape(tensor=patch_pos_embed, shape=(1, -1, dim))
+        # Adding the class token back
+        return tf.concat(values=(class_pos_embed, patch_pos_embed), axis=1)
+
+    def call(
+        self,
+        hidden_states,
+        ids_restore,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        interpolate_pos_encoding=False,
+    ):
+        # embed tokens
+        x = self.decoder_embed(hidden_states)
+        # append mask tokens to sequence
+        mask_tokens = tf.tile(
+            self.mask_token,
+            (shape_list(x)[0], shape_list(ids_restore)[1] + 1 - shape_list(x)[1], 1),
+        )
+        x_ = tf.concat([x[:, 1:, :], mask_tokens], axis=1)  # no cls token
+        x_ = tf.gather(x_, axis=1, batch_dims=1, indices=ids_restore)  # unshuffle
+        x = tf.concat([x[:, :1, :], x_], axis=1)  # append cls token
+        if interpolate_pos_encoding:
+            decoder_pos_embed = self.interpolate_pos_encoding(x)
+        else:
+            decoder_pos_embed = self.decoder_pos_embed
+        # add pos embed
+        hidden_states = x + decoder_pos_embed
+        # apply Transformer layers (blocks)
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        for i, layer_module in enumerate(self.decoder_layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(
+                hidden_states,
+                head_mask=None,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if 
output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = self.decoder_norm(hidden_states) + + # predictor projection + logits = self.decoder_pred(hidden_states) + + # remove cls token + logits = logits[:, 1:, :] + + if not return_dict: + return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None) + return TFViTMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions) + + +@add_start_docstrings( + "The ViTMAE Model transformer with the decoder on top for self-supervised pre-training.", + VIT_MAE_START_DOCSTRING, +) +class TFViTMAEForPreTraining(TFViTMAEPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.vit = TFViTMAEMainLayer(config, name="vit") + self.decoder = TFViTMAEDecoder( + config, + num_patches=self.vit.embeddings.num_patches, + name="decoder", + ) + + def get_input_embeddings(self): + return self.vit.get_input_embeddings() + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + def patchify(self, pixel_values, interpolate_pos_encoding: bool = False): + """ + Args: + pixel_values (`tf.Tensor` of shape `(batch_size, height, width, num_channels)` or `(batch_size, num_channels, height, width)`): + Pixel values. + interpolate_pos_encoding (`bool`, default `False`): + interpolation flag passed during the forward pass. + + Returns: + `tf.Tensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`: + Patchified pixel values. + """ + patch_size, num_channels = self.config.patch_size, self.config.num_channels + # make sure channels are last + if shape_list(pixel_values)[1] == num_channels: + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + + # sanity checks + if not interpolate_pos_encoding: + tf.debugging.assert_equal( + shape_list(pixel_values)[1], + shape_list(pixel_values)[2], + message="Make sure the pixel values have a squared size", + ) + tf.debugging.assert_equal( + shape_list(pixel_values)[1] % patch_size, + 0, + message="Make sure the pixel values have a size that is divisible by the patch size", + ) + tf.debugging.assert_equal( + shape_list(pixel_values)[3], + num_channels, + message=( + "Make sure the number of channels of the pixel values is equal to the one set in the configuration" + ), + ) + + # patchify + batch_size = shape_list(pixel_values)[0] + num_patches_h = shape_list(pixel_values)[1] // patch_size + num_patches_w = shape_list(pixel_values)[2] // patch_size + patchified_pixel_values = tf.reshape( + pixel_values, + (batch_size, num_patches_h, patch_size, num_patches_w, patch_size, num_channels), + ) + patchified_pixel_values = tf.einsum("nhpwqc->nhwpqc", patchified_pixel_values) + patchified_pixel_values = tf.reshape( + patchified_pixel_values, + (batch_size, num_patches_h * num_patches_w, patch_size**2 * num_channels), + ) + return patchified_pixel_values + + def unpatchify(self, patchified_pixel_values, original_image_size: Optional[Tuple[int, int]] = None): + """ + Args: + patchified_pixel_values (`tf.Tensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`: + Patchified pixel values. + original_image_size (`Tuple[int, int]`, *optional*): + Original image size. + + Returns: + `tf.Tensor` of shape `(batch_size, height, width, num_channels)`: + Pixel values. 
+ """ + patch_size, num_channels = self.config.patch_size, self.config.num_channels + original_image_size = ( + original_image_size + if original_image_size is not None + else (self.config.image_size, self.config.image_size) + ) + original_height, original_width = original_image_size + num_patches_h = original_height // patch_size + num_patches_w = original_width // patch_size + # sanity check + tf.debugging.assert_equal( + num_patches_h * num_patches_w, + shape_list(patchified_pixel_values)[1], + message=f"The number of patches in the patchified pixel values is {shape_list(patchified_pixel_values)[1]} does not match the patches of original image {num_patches_w}*{num_patches_h}", + ) + + # unpatchify + batch_size = shape_list(patchified_pixel_values)[0] + patchified_pixel_values = tf.reshape( + patchified_pixel_values, + (batch_size, num_patches_h, num_patches_w, patch_size, patch_size, num_channels), + ) + patchified_pixel_values = tf.einsum("nhwpqc->nhpwqc", patchified_pixel_values) + pixel_values = tf.reshape( + patchified_pixel_values, + (batch_size, num_patches_h * patch_size, num_patches_w * patch_size, num_channels), + ) + return pixel_values + + def forward_loss(self, pixel_values, pred, mask, interpolate_pos_encoding: bool = False): + """ + Args: + pixel_values (`tf.Tensor` of shape `(batch_size, height, width, num_channels)`): + Pixel values. + pred (`tf.Tensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`: + Predicted pixel values. + mask (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (1) and which are not (0). + interpolate_pos_encoding (`bool`, *optional*, default `False`): + interpolation flag passed during the forward pass. + + Returns: + `tf.Tensor`: Pixel reconstruction loss. 
+ """ + target = self.patchify(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + if self.config.norm_pix_loss: + mean = tf.reduce_mean(target, axis=-1, keepdims=True) + var = tf.math.reduce_variance(target, axis=-1, keepdims=True) + target = (target - mean) / (var + 1.0e-6) ** 0.5 + + loss = (pred - target) ** 2 + loss = tf.reduce_mean(loss, axis=-1) # [batch_size, num_patches], mean loss per patch + + loss = tf.reduce_sum(loss * mask) / tf.reduce_sum(mask) # mean loss on removed patches + loss = tf.reshape(loss, (1,)) + return loss + + @unpack_inputs + @add_start_docstrings_to_model_forward(VIT_MAE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFViTMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: TFModelInputType | None = None, + noise: Optional[tf.Tensor] = None, + head_mask: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + interpolate_pos_encoding: bool = False, + ) -> Union[TFViTMAEForPreTrainingOutput, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, TFViTMAEForPreTraining + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base") + >>> model = TFViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> loss = outputs.loss + >>> mask = outputs.mask + >>> ids_restore = outputs.ids_restore + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values=pixel_values, + noise=noise, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + latent = outputs.last_hidden_state + ids_restore = outputs.ids_restore + mask = outputs.mask + + # [batch_size, num_patches, patch_size**2*3] + decoder_outputs = self.decoder(latent, ids_restore, interpolate_pos_encoding=interpolate_pos_encoding) + logits = decoder_outputs.logits + + loss = self.forward_loss(pixel_values, logits, mask, interpolate_pos_encoding=interpolate_pos_encoding) + + if not return_dict: + output = (logits, mask, ids_restore) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFViTMAEForPreTrainingOutput( + loss=loss, + logits=logits, + mask=mask, + ids_restore=ids_restore, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vit", None) is not None: + with tf.name_scope(self.vit.name): + self.vit.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +__all__ = ["TFViTMAEForPreTraining", "TFViTMAEModel", "TFViTMAEPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/vit_mae/modeling_vit_mae.py b/docs/transformers/build/lib/transformers/models/vit_mae/modeling_vit_mae.py new file mode 100644 index 
0000000000000000000000000000000000000000..4636519ee67e519cb77d08f4897956e856c19b80 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit_mae/modeling_vit_mae.py @@ -0,0 +1,1163 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ViT MAE (masked autoencoder) model.""" + +import collections.abc +from copy import deepcopy +from dataclasses import dataclass +from typing import Callable, Optional, Set, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from .configuration_vit_mae import ViTMAEConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "ViTMAEConfig" +_CHECKPOINT_FOR_DOC = "facebook/vit-mae-base" + + +@dataclass +class ViTMAEModelOutput(ModelOutput): + """ + Class for ViTMAEModel's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (1) and which are not (0). + ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Tensor containing the original index of the (shuffled) masked patches. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + mask: Optional[torch.LongTensor] = None + ids_restore: Optional[torch.LongTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class ViTMAEDecoderOutput(ModelOutput): + """ + Class for ViTMAEDecoder's outputs, with potential hidden states and attentions. 
+ + Args: + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class ViTMAEForPreTrainingOutput(ModelOutput): + """ + Class for ViTMAEForPreTraining's outputs, with potential hidden states and attentions. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`): + Pixel reconstruction loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (1) and which are not (0). + ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Tensor containing the original index of the (shuffled) masked patches. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + mask: Optional[torch.LongTensor] = None + ids_restore: Optional[torch.LongTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False): + """ + Create 2D sin/cos positional embeddings. + + Args: + embed_dim (`int`): + Embedding dimension. + grid_size (`int`): + The grid height and width. + add_cls_token (`bool`, *optional*, defaults to `False`): + Whether or not to add a classification (CLS) token. 
+ + Returns: + (`torch.FloatTensor` of shape (grid_size*grid_size, embed_dim) or (1+grid_size*grid_size, embed_dim): the + position embeddings (with or without classification token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if add_cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be even") + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D) + """ + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be even") + + omega = np.arange(embed_dim // 2, dtype=float) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class ViTMAEEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + + """ + + def __init__(self, config): + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.patch_embeddings = ViTMAEPatchEmbeddings(config) + self.num_patches = self.patch_embeddings.num_patches + # fixed sin-cos embedding + self.position_embeddings = nn.Parameter( + torch.zeros(1, self.num_patches + 1, config.hidden_size), requires_grad=False + ) + self.patch_size = config.patch_size + self.config = config + + def initialize_weights(self): + # initialize (and freeze) position embeddings by sin-cos embedding + pos_embed = get_2d_sincos_pos_embed( + self.position_embeddings.shape[-1], int(self.patch_embeddings.num_patches**0.5), add_cls_token=True + ) + self.position_embeddings.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0)) + + # initialize patch_embeddings like nn.Linear (instead of nn.Conv2d) + w = self.patch_embeddings.projection.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + + # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.) + torch.nn.init.normal_(self.cls_token, std=self.config.initializer_range) + + # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
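Stepping back to the sin-cos helpers just defined: the 1D building block reduces to an outer product of positions and frequencies followed by `sin`/`cos`. A toy sketch mirroring `get_1d_sincos_pos_embed_from_grid` (the 4-position, 8-dimension sizes are illustrative):

```python
import numpy as np

# 4 positions encoded into 8 dims (4 sine + 4 cosine channels)
embed_dim, positions = 8, np.arange(4, dtype=float)
omega = 1.0 / 10000 ** (np.arange(embed_dim // 2, dtype=float) / (embed_dim / 2))
out = np.einsum("m,d->md", positions, omega)  # (4, 4) outer product
emb = np.concatenate([np.sin(out), np.cos(out)], axis=1)  # (4, 8)
print(emb.shape)
```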
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def random_masking(self, sequence, noise=None): + """ + Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random + noise. + + Args: + sequence (`torch.LongTensor` of shape `(batch_size, sequence_length, dim)`) + noise (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*) which is + mainly used for testing purposes to control randomness and maintain the reproducibility + """ + batch_size, seq_length, dim = sequence.shape + len_keep = int(seq_length * (1 - self.config.mask_ratio)) + + if noise is None: + noise = torch.rand(batch_size, seq_length, device=sequence.device) # noise in [0, 1] + + # sort noise for each sample + ids_shuffle = torch.argsort(noise, dim=1).to(sequence.device) # ascend: small is keep, large is remove + ids_restore = torch.argsort(ids_shuffle, dim=1).to(sequence.device) + + # keep the first subset + ids_keep = ids_shuffle[:, :len_keep] + sequence_unmasked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, dim)) + + # generate the binary mask: 0 is keep, 1 is remove + mask = torch.ones([batch_size, seq_length], device=sequence.device) + mask[:, :len_keep] = 0 + # unshuffle to get the binary mask + mask = torch.gather(mask, dim=1, index=ids_restore) + + return sequence_unmasked, mask, ids_restore + + def forward(self, pixel_values, noise=None, interpolate_pos_encoding: bool = False): + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + if interpolate_pos_encoding: + position_embeddings = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embeddings = self.position_embeddings + + # add position embeddings w/o cls token + embeddings = embeddings + position_embeddings[:, 1:, :] + + # masking: length -> length * config.mask_ratio + embeddings, mask, ids_restore = self.random_masking(embeddings, noise) + + # append cls token + cls_token = self.cls_token + position_embeddings[:, :1, :] + cls_tokens = cls_token.expand(embeddings.shape[0], -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + return embeddings, mask, ids_restore + + +class 
ViTMAEPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values, interpolate_pos_encoding: bool = False): + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + if not interpolate_pos_encoding and (height != self.image_size[0] or width != self.image_size[1]): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + x = self.projection(pixel_values).flatten(2).transpose(1, 2) + return x + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention ViT->ViTMAE +class ViTMAESelfAttention(nn.Module): + def __init__(self, config: ViTMAEConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTMAE +class ViTMAESelfOutput(nn.Module): + """ + The residual connection is defined in ViTMAELayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
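The `config._attn_implementation` branch above dispatches to the kernels registered in `ALL_ATTENTION_FUNCTIONS`; `"eager"` reproduces `eager_attention_forward`. In recent `transformers` releases the backend can be selected at load time. A hedged usage sketch (the expected output shape assumes the default 0.75 mask ratio on a 224x224 input, i.e. 49 kept patches plus the CLS token):

```python
import torch
from transformers import ViTMAEModel

# "sdpa" routes attention through torch.nn.functional.scaled_dot_product_attention
model = ViTMAEModel.from_pretrained("facebook/vit-mae-base", attn_implementation="sdpa")

pixel_values = torch.rand(1, 3, 224, 224)
outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 50, 768])
```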
+ """ + + def __init__(self, config: ViTMAEConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTMAE +class ViTMAEAttention(nn.Module): + def __init__(self, config: ViTMAEConfig) -> None: + super().__init__() + self.attention = ViTMAESelfAttention(config) + self.output = ViTMAESelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate ViT->ViTMAE +class ViTMAEIntermediate(nn.Module): + def __init__(self, config: ViTMAEConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput ViT->ViTMAE +class ViTMAEOutput(nn.Module): + def __init__(self, config: ViTMAEConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTMAE,VIT->VITMAE +class ViTMAELayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTMAEConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + 
self.seq_len_dim = 1 + self.attention = ViTMAEAttention(config) + self.intermediate = ViTMAEIntermediate(config) + self.output = ViTMAEOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViTMAE, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViTMAE, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTMAE +class ViTMAEEncoder(nn.Module): + def __init__(self, config: ViTMAEConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTMAELayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ViTMAEPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = ViTMAEConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, ViTMAEEmbeddings): + module.initialize_weights() + elif isinstance(module, ViTMAEDecoder): + module.mask_token.data.zero_() + module.decoder_pos_embed.data.zero_() + + +VIT_MAE_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTMAEConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VIT_MAE_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, default `False`): + Whether to interpolate the pre-trained position encodings. This is mainly used to use the model on higher + resolution images. +""" + + +@add_start_docstrings( + "The bare ViTMAE Model transformer outputting raw hidden-states without any specific head on top.", + VIT_MAE_START_DOCSTRING, +) +class ViTMAEModel(ViTMAEPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = ViTMAEEmbeddings(config) + self.encoder = ViTMAEEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIT_MAE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ViTMAEModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + noise: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, ViTMAEModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ViTMAEModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base") + >>> model = ViTMAEModel.from_pretrained("facebook/vit-mae-base") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output, mask, ids_restore = self.embeddings( + pixel_values, noise=noise, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + return (sequence_output, mask, ids_restore) + encoder_outputs[1:] + + return ViTMAEModelOutput( + last_hidden_state=sequence_output, + mask=mask, + ids_restore=ids_restore, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ViTMAEDecoder(nn.Module): + def __init__(self, config, num_patches): + super().__init__() + self.decoder_embed = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size)) + self.decoder_pos_embed = nn.Parameter( + torch.zeros(1, num_patches + 1, config.decoder_hidden_size), requires_grad=False + ) # fixed sin-cos embedding + + decoder_config = deepcopy(config) + decoder_config.hidden_size = config.decoder_hidden_size + decoder_config.num_hidden_layers = config.decoder_num_hidden_layers + decoder_config.num_attention_heads = config.decoder_num_attention_heads + 
decoder_config.intermediate_size = config.decoder_intermediate_size
+ self.decoder_layers = nn.ModuleList(
+ [ViTMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
+ )
+
+ self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
+ self.decoder_pred = nn.Linear(
+ config.decoder_hidden_size, config.patch_size**2 * config.num_channels, bias=True
+ ) # decoder to patch pixels
+ self.gradient_checkpointing = False
+ self.config = config
+ self.initialize_weights(num_patches)
+
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor) -> torch.Tensor:
+ """
+ Decoder-side variant of the position-encoding interpolation used in ViT-MAE: it interpolates the
+ pre-trained decoder position encodings so that the model can be used on higher-resolution images.
+
+ Adapted from:
+ https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
+ """
+
+ # -1 removes the class dimension since we later append it without interpolation
+ embeddings_positions = embeddings.shape[1] - 1
+
+ # Separation of class token and patch tokens
+ class_pos_embed = self.decoder_pos_embed[:, :1]
+ patch_pos_embed = self.decoder_pos_embed[:, 1:]
+
+ # To retain the final 3d tensor with the required dimensions
+ dim = self.decoder_pos_embed.shape[-1]
+
+ # Increasing a dimension to enable bicubic interpolation
+ patch_pos_embed = patch_pos_embed.reshape(1, 1, -1, dim)
+
+ # permute to bring the dimension to be interpolated, to the last
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ # Interpolate the decoder position embeddings to the number of patch positions in `embeddings`,
+ # keeping the second-to-last dimension constant
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(patch_pos_embed.shape[-2], embeddings_positions),
+ mode="bicubic",
+ align_corners=False,
+ )
+
+ # Converting back to the original shape
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ # Adding the class token back
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+ def initialize_weights(self, num_patches):
+ # initialize (and freeze) position embeddings by sin-cos embedding
+ decoder_pos_embed = get_2d_sincos_pos_embed(
+ self.decoder_pos_embed.shape[-1], int(num_patches**0.5), add_cls_token=True
+ )
+ self.decoder_pos_embed.data.copy_(torch.from_numpy(decoder_pos_embed).float().unsqueeze(0))
+
+ # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
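+ # the mask token is a learned embedding that stands in for the patches dropped by the encoder;
+ # pretrained checkpoints overwrite this initialization when loaded via from_pretrained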
+ torch.nn.init.normal_(self.mask_token, std=self.config.initializer_range)
+
+ def forward(
+ self,
+ hidden_states,
+ ids_restore,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ interpolate_pos_encoding: bool = False,
+ ):
+ # embed tokens
+ x = self.decoder_embed(hidden_states)
+
+ # append mask tokens to sequence
+ mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
+ x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1) # no cls token
+ # unshuffle
+ x_ = torch.gather(x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2]).to(x_.device))
+ x = torch.cat([x[:, :1, :], x_], dim=1) # append cls token
+ # add pos embed
+ if interpolate_pos_encoding:
+ decoder_pos_embed = self.interpolate_pos_encoding(x)
+ else:
+ decoder_pos_embed = self.decoder_pos_embed
+ hidden_states = x + decoder_pos_embed
+
+ # apply Transformer layers (blocks)
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ for i, layer_module in enumerate(self.decoder_layers):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer_module.__call__,
+ hidden_states,
+ None,
+ output_attentions,
+ )
+ else:
+ layer_outputs = layer_module(hidden_states, head_mask=None, output_attentions=output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ hidden_states = self.decoder_norm(hidden_states)
+
+ # predictor projection
+ logits = self.decoder_pred(hidden_states)
+
+ # remove cls token
+ logits = logits[:, 1:, :]
+
+ if not return_dict:
+ return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
+ return ViTMAEDecoderOutput(
+ logits=logits,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+@add_start_docstrings(
+ """The ViTMAE Model transformer with the decoder on top for self-supervised pre-training.
+
+ <Tip>
+
+ Note that we provide a script to pre-train this model on custom data in our [examples
+ directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
+
+ </Tip>
+
+ """,
+ VIT_MAE_START_DOCSTRING,
+)
+class ViTMAEForPreTraining(ViTMAEPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ self.vit = ViTMAEModel(config)
+ self.decoder = ViTMAEDecoder(config, num_patches=self.vit.embeddings.num_patches)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.vit.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.vit.encoder.layer[layer].attention.prune_heads(heads)
+
+ def patchify(self, pixel_values, interpolate_pos_encoding: bool = False):
+ """
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values.
+ interpolate_pos_encoding (`bool`, *optional*, default `False`):
+ interpolation flag passed during the forward pass.
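+ For example, a `(batch_size, 3, 224, 224)` input with `patch_size=16` is patchified to
+ `(batch_size, 196, 768)`, since `(224 / 16) ** 2 = 196` patches of `16 * 16 * 3 = 768` values each.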
+ + Returns: + `torch.FloatTensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`: + Patchified pixel values. + """ + patch_size, num_channels = self.config.patch_size, self.config.num_channels + # sanity checks + if not interpolate_pos_encoding and ( + pixel_values.shape[2] != pixel_values.shape[3] or pixel_values.shape[2] % patch_size != 0 + ): + raise ValueError("Make sure the pixel values have a squared size that is divisible by the patch size") + if pixel_values.shape[1] != num_channels: + raise ValueError( + "Make sure the number of channels of the pixel values is equal to the one set in the configuration" + ) + + # patchify + batch_size = pixel_values.shape[0] + num_patches_h = pixel_values.shape[2] // patch_size + num_patches_w = pixel_values.shape[3] // patch_size + patchified_pixel_values = pixel_values.reshape( + batch_size, num_channels, num_patches_h, patch_size, num_patches_w, patch_size + ) + patchified_pixel_values = torch.einsum("nchpwq->nhwpqc", patchified_pixel_values) + patchified_pixel_values = patchified_pixel_values.reshape( + batch_size, num_patches_h * num_patches_w, patch_size**2 * num_channels + ) + return patchified_pixel_values + + def unpatchify(self, patchified_pixel_values, original_image_size: Optional[Tuple[int, int]] = None): + """ + Args: + patchified_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`: + Patchified pixel values. + original_image_size (`Tuple[int, int]`, *optional*): + Original image size. + + Returns: + `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`: + Pixel values. + """ + patch_size, num_channels = self.config.patch_size, self.config.num_channels + original_image_size = ( + original_image_size + if original_image_size is not None + else (self.config.image_size, self.config.image_size) + ) + original_height, original_width = original_image_size + num_patches_h = original_height // patch_size + num_patches_w = original_width // patch_size + # sanity check + if num_patches_h * num_patches_w != patchified_pixel_values.shape[1]: + raise ValueError( + f"The number of patches in the patchified pixel values {patchified_pixel_values.shape[1]}, does not match the number of patches on original image {num_patches_h}*{num_patches_w}" + ) + + # unpatchify + batch_size = patchified_pixel_values.shape[0] + patchified_pixel_values = patchified_pixel_values.reshape( + batch_size, + num_patches_h, + num_patches_w, + patch_size, + patch_size, + num_channels, + ) + patchified_pixel_values = torch.einsum("nhwpqc->nchpwq", patchified_pixel_values) + pixel_values = patchified_pixel_values.reshape( + batch_size, + num_channels, + num_patches_h * patch_size, + num_patches_w * patch_size, + ) + return pixel_values + + def forward_loss(self, pixel_values, pred, mask, interpolate_pos_encoding: bool = False): + """ + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. + pred (`torch.FloatTensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`: + Predicted pixel values. + mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Tensor indicating which patches are masked (1) and which are not (0). + interpolate_pos_encoding (`bool`, *optional*, default `False`): + interpolation flag passed during the forward pass. + + Returns: + `torch.FloatTensor`: Pixel reconstruction loss. 
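+ When `config.norm_pix_loss` is `True`, each target patch is first normalized to zero mean and unit
+ variance, `target = (target - mean) / sqrt(var + 1e-6)`, before the squared error is computed; the
+ loss is then averaged per patch and over the masked patches only, `loss = (loss * mask).sum() / mask.sum()`.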
+ """ + target = self.patchify(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + if self.config.norm_pix_loss: + mean = target.mean(dim=-1, keepdim=True) + var = target.var(dim=-1, keepdim=True) + target = (target - mean) / (var + 1.0e-6) ** 0.5 + + loss = (pred - target) ** 2 + loss = loss.mean(dim=-1) # [N, L], mean loss per patch + loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches + return loss + + @add_start_docstrings_to_model_forward(VIT_MAE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ViTMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + noise: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, ViTMAEForPreTrainingOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ViTMAEForPreTraining + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base") + >>> model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> loss = outputs.loss + >>> mask = outputs.mask + >>> ids_restore = outputs.ids_restore + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values, + noise=noise, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + latent = outputs.last_hidden_state + ids_restore = outputs.ids_restore + mask = outputs.mask + + decoder_outputs = self.decoder(latent, ids_restore, interpolate_pos_encoding=interpolate_pos_encoding) + logits = decoder_outputs.logits # shape (batch_size, num_patches, patch_size*patch_size*num_channels) + + loss = self.forward_loss(pixel_values, logits, mask, interpolate_pos_encoding=interpolate_pos_encoding) + + if not return_dict: + output = (logits, mask, ids_restore) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ViTMAEForPreTrainingOutput( + loss=loss, + logits=logits, + mask=mask, + ids_restore=ids_restore, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["ViTMAEForPreTraining", "ViTMAELayer", "ViTMAEModel", "ViTMAEPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/vit_msn/__init__.py b/docs/transformers/build/lib/transformers/models/vit_msn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..547c5f4c04b912fe69a09470106c0d523df63931 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit_msn/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_vit_msn import *
+ from .modeling_vit_msn import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/vit_msn/configuration_vit_msn.py b/docs/transformers/build/lib/transformers/models/vit_msn/configuration_vit_msn.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd47df3e9932e0a7e4fa1a9084976214fa9d2939
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/vit_msn/configuration_vit_msn.py
@@ -0,0 +1,115 @@
+# coding=utf-8
+# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ViT MSN model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ViTMSNConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`ViTMSNModel`]. It is used to instantiate a ViT
+ MSN model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+ the defaults will yield a similar configuration to that of the ViT MSN
+ [facebook/vit_msn_base](https://huggingface.co/facebook/vit_msn_base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the layer normalization layers.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+
+ Example:
+
+ ```python
+ >>> from transformers import ViTMSNModel, ViTMSNConfig
+
+ >>> # Initializing a ViT MSN vit-msn-base style configuration
+ >>> configuration = ViTMSNConfig()
+
+ >>> # Initializing a model from the vit-msn-base style configuration
+ >>> model = ViTMSNModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "vit_msn"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-06,
+ image_size=224,
+ patch_size=16,
+ num_channels=3,
+ qkv_bias=True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.qkv_bias = qkv_bias
+
+
+__all__ = ["ViTMSNConfig"]
diff --git a/docs/transformers/build/lib/transformers/models/vit_msn/convert_msn_to_pytorch.py b/docs/transformers/build/lib/transformers/models/vit_msn/convert_msn_to_pytorch.py
new file mode 100644
index 0000000000000000000000000000000000000000..899c74f183205e9fdc18984a1f15e877bc64fe31
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/vit_msn/convert_msn_to_pytorch.py
@@ -0,0 +1,245 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
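+
+# Example invocation (the output folder name below is illustrative):
+#   python convert_msn_to_pytorch.py \
+#       --checkpoint_url https://dl.fbaipublicfiles.com/msn/vits16_800ep.pth.tar \
+#       --pytorch_dump_folder_path ./vit-msn-small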
+"""Convert ViT MSN checkpoints from the original repository: https://github.com/facebookresearch/msn""" + +import argparse +import json + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import ViTImageProcessor, ViTMSNConfig, ViTMSNModel +from transformers.image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD + + +torch.set_grad_enabled(False) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, base_model=False): + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"module.blocks.{i}.norm1.weight", f"vit.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"module.blocks.{i}.norm1.bias", f"vit.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append( + (f"module.blocks.{i}.attn.proj.weight", f"vit.encoder.layer.{i}.attention.output.dense.weight") + ) + rename_keys.append((f"module.blocks.{i}.attn.proj.bias", f"vit.encoder.layer.{i}.attention.output.dense.bias")) + rename_keys.append((f"module.blocks.{i}.norm2.weight", f"vit.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"module.blocks.{i}.norm2.bias", f"vit.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"module.blocks.{i}.mlp.fc1.weight", f"vit.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"module.blocks.{i}.mlp.fc1.bias", f"vit.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"module.blocks.{i}.mlp.fc2.weight", f"vit.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"module.blocks.{i}.mlp.fc2.bias", f"vit.encoder.layer.{i}.output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend( + [ + ("module.cls_token", "vit.embeddings.cls_token"), + ("module.patch_embed.proj.weight", "vit.embeddings.patch_embeddings.projection.weight"), + ("module.patch_embed.proj.bias", "vit.embeddings.patch_embeddings.projection.bias"), + ("module.pos_embed", "vit.embeddings.position_embeddings"), + ] + ) + + if base_model: + # layernorm + pooler + rename_keys.extend( + [ + ("module.norm.weight", "layernorm.weight"), + ("module.norm.bias", "layernorm.bias"), + ] + ) + + # if just the base model, we should remove "vit" from all keys that start with "vit" + rename_keys = [(pair[0], pair[1][4:]) if pair[1].startswith("vit") else pair for pair in rename_keys] + else: + # layernorm + classification head + rename_keys.extend( + [ + ("norm.weight", "vit.layernorm.weight"), + ("norm.bias", "vit.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, base_model=False): + for i in range(config.num_hidden_layers): + if base_model: + prefix = "" + else: + prefix = "vit." 
+ # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"module.blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"module.blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"{prefix}encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +def remove_classification_head_(state_dict): + ignore_keys = ["head.weight", "head.bias"] + for k in ignore_keys: + state_dict.pop(k, None) + + +def remove_projection_head(state_dict): + # projection head is used in the self-supervised pre-training in MSN, + # for downstream task it's not needed. + ignore_keys = [ + "module.fc.fc1.weight", + "module.fc.fc1.bias", + "module.fc.bn1.weight", + "module.fc.bn1.bias", + "module.fc.bn1.running_mean", + "module.fc.bn1.running_var", + "module.fc.bn1.num_batches_tracked", + "module.fc.fc2.weight", + "module.fc.fc2.bias", + "module.fc.bn2.weight", + "module.fc.bn2.bias", + "module.fc.bn2.running_mean", + "module.fc.bn2.running_var", + "module.fc.bn2.num_batches_tracked", + "module.fc.fc3.weight", + "module.fc.fc3.bias", + ] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def convert_vit_msn_checkpoint(checkpoint_url, pytorch_dump_folder_path): + config = ViTMSNConfig() + config.num_labels = 1000 + + repo_id = "datasets/huggingface/label-files" + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + if "s16" in checkpoint_url: + config.hidden_size = 384 + config.intermediate_size = 1536 + config.num_attention_heads = 6 + elif "l16" in checkpoint_url: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + config.hidden_dropout_prob = 0.1 + elif "b4" in checkpoint_url: + config.patch_size = 4 + elif "l7" in checkpoint_url: + config.patch_size = 7 + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + config.hidden_dropout_prob = 0.1 + + model = ViTMSNModel(config) + + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["target_encoder"] + + image_processor = ViTImageProcessor(size=config.image_size) + + remove_projection_head(state_dict) + rename_keys = create_rename_keys(config, base_model=True) + + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, base_model=True) + + model.load_state_dict(state_dict) + model.eval() + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + + image = 
Image.open(requests.get(url, stream=True).raw) + image_processor = ViTImageProcessor( + size=config.image_size, image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD + ) + inputs = image_processor(images=image, return_tensors="pt") + + # forward pass + torch.manual_seed(2) + outputs = model(**inputs) + last_hidden_state = outputs.last_hidden_state + + # The following Colab Notebook was used to generate these outputs: + # https://colab.research.google.com/gist/sayakpaul/3672419a04f5997827503fd84079bdd1/scratchpad.ipynb + if "s16" in checkpoint_url: + expected_slice = torch.tensor([[-1.0915, -1.4876, -1.1809]]) + elif "b16" in checkpoint_url: + expected_slice = torch.tensor([[14.2889, -18.9045, 11.7281]]) + elif "l16" in checkpoint_url: + expected_slice = torch.tensor([[41.5028, -22.8681, 45.6475]]) + elif "b4" in checkpoint_url: + expected_slice = torch.tensor([[-4.3868, 5.2932, -0.4137]]) + else: + expected_slice = torch.tensor([[-0.1792, -0.6465, 2.4263]]) + + # verify logits + assert torch.allclose(last_hidden_state[:, 0, :3], expected_slice, atol=1e-4) + + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--checkpoint_url", + default="https://dl.fbaipublicfiles.com/msn/vits16_800ep.pth.tar", + type=str, + help="URL of the checkpoint you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_vit_msn_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/docs/transformers/build/lib/transformers/models/vit_msn/modeling_vit_msn.py b/docs/transformers/build/lib/transformers/models/vit_msn/modeling_vit_msn.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5a3d56ba607f2319ca2132e2f6b6ad9341bafd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vit_msn/modeling_vit_msn.py @@ -0,0 +1,741 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch ViT MSN (masked siamese network) model.""" + +import collections.abc +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from .configuration_vit_msn import ViTMSNConfig + + +logger = logging.get_logger(__name__) + + +_CONFIG_FOR_DOC = "ViTMSNConfig" +_CHECKPOINT_FOR_DOC = "facebook/vit-msn-small" + + +class ViTMSNEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: ViTMSNConfig, use_mask_token: bool = False) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = ViTMSNPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.ViTPatchEmbeddings with ViT->ViTMSN +class ViTMSNPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
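+ For example, with `image_size=224`, `patch_size=16` and `hidden_size=768`, a `(batch_size, 3, 224, 224)`
+ input is projected by a stride-16 convolution to `(batch_size, 196, 768)`.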
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->ViTMSN +class ViTMSNSelfAttention(nn.Module): + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTMSN +class ViTMSNSelfOutput(nn.Module): + """ + The residual connection is defined in ViTMSNLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
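+ Consequently, this module only applies the dense projection and dropout; the residual addition
+ happens in ViTMSNLayer.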
+ """ + + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTMSN +class ViTMSNAttention(nn.Module): + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__() + self.attention = ViTMSNSelfAttention(config) + self.output = ViTMSNSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->ViTMSN +class ViTMSNIntermediate(nn.Module): + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->ViTMSN +class ViTMSNOutput(nn.Module): + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTMSN, VIT->VITMSN +class ViTMSNLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + 
self.seq_len_dim = 1 + self.attention = ViTMSNAttention(config) + self.intermediate = ViTMSNIntermediate(config) + self.output = ViTMSNOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViTMSN, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViTMSN, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTMSN +class ViTMSNEncoder(nn.Module): + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ViTMSNLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ViTMSNPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ViTMSNConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["ViTMSNAttention", "ViTMSNSdpaAttention"] + _supports_sdpa = True + _supports_flash_attn_2 = True + + # todo: Resort to https://github.com/facebookresearch/msn/blob/main/src/deit.py#L200-#L211 + # when creating pre-training scripts. 
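+ # note: cls_token, position_embeddings and the optional mask_token are zero-initialized below;
+ # the real values come from the checkpoint when the model is loaded with from_pretrained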
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, ViTMSNEmbeddings): + module.cls_token.data.zero_() + module.position_embeddings.data.zero_() + if module.mask_token is not None: + module.mask_token.data.zero_() + + +VIT_MSN_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ViTMSNConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VIT_MSN_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ViTMSN Model outputting raw hidden-states without any specific head on top.", + VIT_MSN_START_DOCSTRING, +) +class ViTMSNModel(ViTMSNPreTrainedModel): + def __init__(self, config: ViTMSNConfig, use_mask_token: bool = False): + super().__init__(config) + self.config = config + + self.embeddings = ViTMSNEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = ViTMSNEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> ViTMSNPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIT_MSN_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BaseModelOutput]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ViTMSNModel + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-msn-small") + >>> model = ViTMSNModel.from_pretrained("facebook/vit-msn-small") + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> with torch.no_grad(): + ... outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + head_outputs = (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Caution: We don't have the weights for the classification head yet. This class +# is here for the users that are interested to fine-tune the base model (ViTMSNModel). +@add_start_docstrings( + """ + ViTMSN Model with an image classification head on top e.g. for ImageNet. 
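+
+    Since the released checkpoints do not include classification-head weights (see the code comment
+    above this class), the head is randomly initialized and meant to be fine-tuned. An illustrative
+    instantiation (the 10-label setup is an assumption, not a released configuration):
+
+    ```python
+    from transformers import ViTMSNForImageClassification
+
+    model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-small", num_labels=10)
+    ```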
+ """, + VIT_MSN_START_DOCSTRING, +) +class ViTMSNForImageClassification(ViTMSNPreTrainedModel): + def __init__(self, config: ViTMSNConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.vit = ViTMSNModel(config) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIT_MSN_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, ViTMSNForImageClassification + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> torch.manual_seed(2) # doctest: +IGNORE_RESULT + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-msn-small") + >>> model = ViTMSNForImageClassification.from_pretrained("facebook/vit-msn-small") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_label = logits.argmax(-1).item() + >>> print(model.config.id2label[predicted_label]) + tusker + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.vit( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output[:, 0, :]) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["ViTMSNModel", "ViTMSNForImageClassification", "ViTMSNPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/vitdet/__init__.py 
b/docs/transformers/build/lib/transformers/models/vitdet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f96b2fdf7d6247455c115ef40700507076fe6a12 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitdet/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vitdet import * + from .modeling_vitdet import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/vitdet/configuration_vitdet.py b/docs/transformers/build/lib/transformers/models/vitdet/configuration_vitdet.py new file mode 100644 index 0000000000000000000000000000000000000000..cd91dce9b2961e92112dd2589213c0979a03ad3d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitdet/configuration_vitdet.py @@ -0,0 +1,156 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VitDet model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class VitDetConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VitDetModel`]. It is used to instantiate an + VitDet model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the VitDet + [google/vitdet-base-patch16-224](https://huggingface.co/google/vitdet-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + mlp_ratio (`int`, *optional*, defaults to 4): + Ratio of mlp hidden dim to embedding dim. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + pretrain_image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image during pretraining. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Stochastic depth rate. + window_block_indices (`List[int]`, *optional*, defaults to `[]`): + List of indices of blocks that should have window attention instead of regular global self-attention. + residual_block_indices (`List[int]`, *optional*, defaults to `[]`): + List of indices of blocks that should have an extra residual block after the MLP. + use_absolute_position_embeddings (`bool`, *optional*, defaults to `True`): + Whether to add absolute position embeddings to the patch embeddings. + use_relative_position_embeddings (`bool`, *optional*, defaults to `False`): + Whether to add relative position embeddings to the attention maps. + window_size (`int`, *optional*, defaults to 0): + The size of the attention window. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. 
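+
+    For instance, with the default 12 hidden layers the stages are named `"stem"`, `"stage1"`, ...,
+    `"stage12"`, so passing `out_features=["stage12"]` is equivalent to passing `out_indices=[12]`.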
+ + Example: + + ```python + >>> from transformers import VitDetConfig, VitDetModel + + >>> # Initializing a VitDet configuration + >>> configuration = VitDetConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = VitDetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vitdet" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + mlp_ratio=4, + hidden_act="gelu", + dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-6, + image_size=224, + pretrain_image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + drop_path_rate=0.0, + window_block_indices=[], + residual_block_indices=[], + use_absolute_position_embeddings=True, + use_relative_position_embeddings=False, + window_size=0, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_ratio = mlp_ratio + self.hidden_act = hidden_act + self.dropout_prob = dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.pretrain_image_size = pretrain_image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.drop_path_rate = drop_path_rate + self.window_block_indices = window_block_indices + self.residual_block_indices = residual_block_indices + self.use_absolute_position_embeddings = use_absolute_position_embeddings + self.use_relative_position_embeddings = use_relative_position_embeddings + self.window_size = window_size + + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + + +__all__ = ["VitDetConfig"] diff --git a/docs/transformers/build/lib/transformers/models/vitdet/modeling_vitdet.py b/docs/transformers/build/lib/transformers/models/vitdet/modeling_vitdet.py new file mode 100644 index 0000000000000000000000000000000000000000..3d740522884d07b03c182944f6a6644f25c225de --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitdet/modeling_vitdet.py @@ -0,0 +1,883 @@ +# coding=utf-8 +# Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch ViTDet backbone.""" + +import collections.abc +import math +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_vitdet import VitDetConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "VitDetConfig" + + +class VitDetEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) to be consumed by a Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.pretrain_image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + if config.use_absolute_position_embeddings: + # Initialize absolute positional embedding with pretrain image size. + num_positions = num_patches + 1 + self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size)) + else: + self.position_embeddings = None + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width): + """ + Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the + original embeddings. + + Args: + abs_pos_embeddings (`torch.Tensor`): + Absolute positional embeddings with (1, num_position, num_channels). + has_cls_token (`bool`): + If true, has 1 embedding in abs_pos_embeddings for cls token. + height (`int`): + Height of input image tokens. + width (`int`): + Width of input image tokens. + + Returns: + Absolute positional embeddings after processing with shape (1, height, width, num_channels) + """ + if has_cls_token: + abs_pos_embeddings = abs_pos_embeddings[:, 1:] + num_position = abs_pos_embeddings.shape[1] + size = int(math.sqrt(num_position)) # This is a constant and can be recorded as such in the ONNX export. + if size * size != num_position: + raise ValueError("Absolute position embeddings must be a square number.") + + if torch.jit.is_tracing() or (size != height or size != width): + # nn.functional.interpolate is a noop in case size == height and size == width - we need to always capture this path with jit.trace. 
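+            # Reshape (1, size*size, C) -> (1, size, size, C), then permute to (1, C, size, size)
+            # so the bicubic interpolation below runs over the spatial grid; the result is
+            # permuted back to channels-last before being returned.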
+ new_abs_pos_embeddings = nn.functional.interpolate( + abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2), + size=(height, width), + mode="bicubic", + align_corners=False, + ) + + return new_abs_pos_embeddings.permute(0, 2, 3, 1) + else: + return abs_pos_embeddings.reshape(1, height, width, -1) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." + ) + embeddings = self.projection(pixel_values) + + if self.position_embeddings is not None: + # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels) + embeddings = embeddings.permute(0, 2, 3, 1) + # add position embeddings + embeddings = embeddings + self.get_absolute_positions( + self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2] + ) + # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width) + embeddings = embeddings.permute(0, 3, 1, 2) + + return embeddings + + +@torch.jit.script_if_tracing # nn.functional.interpolate's `size` needs to be dynamic. +def get_rel_pos(q_size, k_size, rel_pos): + """ + Get relative positional embeddings according to the relative positions of query and key sizes. + + Args: + q_size (`int`): + Size of query q. + k_size (`int`): + Size of key k. + rel_pos (`torch.Tensor`): + Relative position embeddings (num_embeddings, num_channels). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel position embeddings. + rel_pos_resized = nn.functional.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size): + """ + Calculate decomposed Relative Positional Embeddings as introduced in + [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py). + + Args: + attn (`torch.Tensor`): + Attention map. + queries (`torch.Tensor`): + Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels). + rel_pos_h (`torch.Tensor`): + Relative position embeddings (Lh, num_channels) for height axis. + rel_pos_w (`torch.Tensor`): + Relative position embeddings (Lw, num_channels) for width axis. + q_size (`Tuple[int]`): + Spatial sequence size of query q with (queries_height, queries_width). + k_size (`Tuple[int]`): + Spatial sequence size of key k with (keys_height, keys_width). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. 
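+
+    Example (illustrative shapes; the 14x14 query/key grid and 64-dim heads are assumptions):
+
+    ```python
+    >>> attn = torch.zeros(1, 196, 196)
+    >>> queries = torch.randn(1, 196, 64)
+    >>> rel_pos = torch.randn(2 * 14 - 1, 64)
+    >>> add_decomposed_relative_positions(attn, queries, rel_pos, rel_pos, (14, 14), (14, 14)).shape
+    torch.Size([1, 196, 196])
+    ```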
+ """ + queries_height, queries_width = q_size + keys_height, keys_width = k_size + relative_height = get_rel_pos(queries_height, keys_height, rel_pos_h) + relative_width = get_rel_pos(queries_width, keys_width, rel_pos_w) + + batch_size, _, dim = queries.shape + r_q = queries.reshape(batch_size, queries_height, queries_width, dim) + relative_height = torch.einsum("bhwc,hkc->bhwk", r_q, relative_height) + relative_weight = torch.einsum("bhwc,wkc->bhwk", r_q, relative_width) + + attn = ( + attn.view(batch_size, queries_height, queries_width, keys_height, keys_width) + + relative_height[:, :, :, :, None] + + relative_weight[:, :, :, None, :] + ).view(batch_size, queries_height * queries_width, keys_height * keys_width) + + return attn + + +class VitDetAttention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__(self, config, input_size=None): + """ + Args: + config (`VitDetConfig`): + Model configuration. + input_size (`Tuple[int]`, *optional*): + Input resolution, only required in case relative position embeddings are added. + """ + super().__init__() + + dim = config.hidden_size + num_heads = config.num_attention_heads + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias) + self.proj = nn.Linear(dim, dim) + + self.use_relative_position_embeddings = config.use_relative_position_embeddings + if self.use_relative_position_embeddings: + # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def forward(self, hidden_state, output_attentions=False): + batch_size, height, width, _ = hidden_state.shape + # qkv with shape (3, batch_size, num_heads, height * width, num_channels) + qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + # queries, keys and values have shape (batch_size * num_heads, height * width, num_channels) + queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0) + + attention_scores = (queries * self.scale) @ keys.transpose(-2, -1) + + if self.use_relative_position_embeddings: + attention_scores = add_decomposed_relative_positions( + attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + + attention_probs = attention_scores.softmax(dim=-1) + + hidden_state = attention_probs @ values + hidden_state = hidden_state.view(batch_size, self.num_heads, height, width, -1) + hidden_state = hidden_state.permute(0, 2, 3, 1, 4) + hidden_state = hidden_state.reshape(batch_size, height, width, -1) + hidden_state = self.proj(hidden_state) + + if output_attentions: + attention_probs = attention_probs.reshape( + batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1] + ) + outputs = (hidden_state, attention_probs) + else: + outputs = (hidden_state,) + + return outputs + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath +class VitDetDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class VitDetLayerNorm(nn.Module): + """ + A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the + channel dimension for inputs that have shape (batch_size, channels, height, width). + https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 + """ + + def __init__(self, normalized_shape, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.normalized_shape = (normalized_shape,) + + def forward(self, x): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class VitDetResBottleneckBlock(nn.Module): + """ + The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels + 1x1, 3x3, 1x1. + """ + + def __init__(self, config, in_channels, out_channels, bottleneck_channels): + """ + Args: + config (`VitDetConfig`): + Model configuration. + in_channels (`int`): + Number of input channels. + out_channels (`int`): + Number of output channels. + bottleneck_channels (`int`): + Number of output channels for the 3x3 "bottleneck" conv layers. 
+ """ + super().__init__() + self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False) + self.norm1 = VitDetLayerNorm(bottleneck_channels) + self.act1 = ACT2FN[config.hidden_act] + + self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False) + self.norm2 = VitDetLayerNorm(bottleneck_channels) + self.act2 = ACT2FN[config.hidden_act] + + self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False) + self.norm3 = VitDetLayerNorm(out_channels) + + def forward(self, x): + out = x + for layer in self.children(): + out = layer(out) + + out = x + out + return out + + +class VitDetMlp(nn.Module): + def __init__(self, config, in_features: int, hidden_features: int) -> None: + super().__init__() + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = ACT2FN[config.hidden_act] + self.fc2 = nn.Linear(hidden_features, in_features) + self.drop = nn.Dropout(config.dropout_prob) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + + return x + + +def window_partition(hidden_state, window_size): + """ + Partition into non-overlapping windows with padding if needed. + + Args: + hidden_state (`torch.Tensor`): + Input tokens with [batch_size, height, width, num_channels]. + window_size (`int`): + Window size. + + Returns: + `tuple(torch.FloatTensor)` comprising various elements: + - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels]. + - (padded_height, padded_width): padded height and width before partition + """ + batch_size, height, width, num_channels = hidden_state.shape + + pad_height = (window_size - height % window_size) % window_size + pad_width = (window_size - width % window_size) % window_size + + # Noop in case pad_width == 0 and pad_height == 0. + hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height)) + + padded_height, padded_width = height + pad_height, width + pad_width + + hidden_state = hidden_state.view( + batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels + ) + windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows, (padded_height, padded_width) + + +def window_unpartition(windows, window_size, pad_height_width, height_width): + """ + Window unpartition into original sequences and removing padding. + + Args: + windows (`torch.Tensor`): + Input tokens with [batch_size * num_windows, window_size, window_size, num_channels]. + window_size (`int`): + Window size. + pad_height_width (`Tuple[int]`): + Padded height and width (padded_height, padded_width). + height_width (`Tuple[int]`): + Original height and width before padding. + + Returns: + hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels]. 
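+
+    Example (illustrative round trip with `window_partition`; the sizes are arbitrary):
+
+    ```python
+    >>> hidden_state = torch.randn(2, 21, 21, 8)
+    >>> windows, pad_height_width = window_partition(hidden_state, window_size=14)
+    >>> windows.shape  # height/width 21 are padded to 28, i.e. 2 x 2 windows per image
+    torch.Size([8, 14, 14, 8])
+    >>> torch.equal(window_unpartition(windows, 14, pad_height_width, (21, 21)), hidden_state)
+    True
+    ```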
+ """ + padded_height, padded_width = pad_height_width + height, width = height_width + batch_size = windows.shape[0] // (padded_height * padded_width // window_size // window_size) + hidden_state = windows.view( + batch_size, padded_height // window_size, padded_width // window_size, window_size, window_size, -1 + ) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous() + hidden_state = hidden_state.view(batch_size, padded_height, padded_width, -1) + + # We always have height <= padded_height and width <= padded_width + hidden_state = hidden_state[:, :height, :width, :].contiguous() + return hidden_state + + +class VitDetLayer(nn.Module): + """This corresponds to the Block class in the original implementation.""" + + def __init__( + self, config: VitDetConfig, drop_path_rate: float = 0, window_size: int = 0, use_residual_block: bool = False + ) -> None: + super().__init__() + + dim = config.hidden_size + + image_size = config.image_size + image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size) + + patch_size = config.patch_size + patch_size = patch_size if isinstance(patch_size, (list, tuple)) else (patch_size, patch_size) + + input_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.attention = VitDetAttention( + config, input_size=input_size if window_size == 0 else (window_size, window_size) + ) + + self.drop_path = VitDetDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.mlp = VitDetMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio)) + + self.window_size = window_size + + self.use_residual_block = use_residual_block + if self.use_residual_block: + # Use a residual block with bottleneck channel as dim // 2 + self.residual = VitDetResBottleneckBlock( + config=config, + in_channels=dim, + out_channels=dim, + bottleneck_channels=dim // 2, + ) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + hidden_states = hidden_states.permute(0, 2, 3, 1) + + shortcut = hidden_states + + hidden_states = self.norm1(hidden_states) + + # Window partition + if self.window_size > 0: + height, width = hidden_states.shape[1], hidden_states.shape[2] + hidden_states, pad_height_width = window_partition(hidden_states, self.window_size) + + self_attention_outputs = self.attention( + hidden_states, + output_attentions=output_attentions, + ) + hidden_states = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # Reverse window partition + if self.window_size > 0: + hidden_states = window_unpartition(hidden_states, self.window_size, pad_height_width, (height, width)) + + # first residual connection + hidden_states = shortcut + self.drop_path(hidden_states) + + hidden_states = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states))) + + hidden_states = hidden_states.permute(0, 3, 1, 2) + + if self.use_residual_block: + hidden_states = self.residual(hidden_states) + + outputs = (hidden_states,) + outputs + + return outputs + + +class VitDetEncoder(nn.Module): + def __init__(self, config: VitDetConfig) -> None: + super().__init__() + self.config = config + depth = config.num_hidden_layers + + # stochastic depth decay rule + drop_path_rate 
= [x.item() for x in torch.linspace(0, config.drop_path_rate, depth, device="cpu")] + + layers = [] + for i in range(depth): + layers.append( + VitDetLayer( + config, + drop_path_rate=drop_path_rate[i], + window_size=config.window_size if i in config.window_block_indices else 0, + use_residual_block=i in config.residual_block_indices, + ) + ) + + self.layer = nn.ModuleList(layers) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +def caffe2_msra_fill(module: nn.Module) -> None: + """ + Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0. + + Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html. + + Args: + module (torch.nn.Module): module to initialize. + """ + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + +class VitDetPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = VitDetConfig + base_model_prefix = "vitdet" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [] + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + elif isinstance(module, VitDetEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + elif isinstance(module, VitDetAttention) and self.config.use_relative_position_embeddings: + module.rel_pos_h.data = nn.init.trunc_normal_( + module.rel_pos_h.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ) + module.rel_pos_w.data = nn.init.trunc_normal_( + module.rel_pos_w.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ) + + elif isinstance(module, VitDetResBottleneckBlock): + for layer in [module.conv1, module.conv2, module.conv3]: + caffe2_msra_fill(layer) + for layer in [module.norm1, module.norm2]: + layer.weight.data.fill_(1.0) + layer.bias.data.zero_() + # zero init last norm layer. + module.norm3.weight.data.zero_() + module.norm3.bias.data.zero_() + + +VITDET_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`VitDetConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VITDET_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare VitDet Transformer model outputting raw hidden-states without any specific head on top.", + VITDET_START_DOCSTRING, +) +class VitDetModel(VitDetPreTrainedModel): + def __init__(self, config: VitDetConfig): + super().__init__(config) + self.config = config + + self.embeddings = VitDetEmbeddings(config) + self.encoder = VitDetEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> VitDetEmbeddings: + return self.embeddings.projection + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + """ + Returns: + + Examples: + + ```python + >>> from transformers import VitDetConfig, VitDetModel + >>> import torch + + >>> config = VitDetConfig() + >>> model = VitDetModel(config) + + >>> pixel_values = torch.randn(1, 3, 224, 224) + + >>> with torch.no_grad(): + ... outputs = model(pixel_values) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 768, 14, 14] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + ViTDet backbone, to be used with frameworks like Mask R-CNN. 
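+
+    An illustrative way to choose which stages the backbone returns (the stage names follow the
+    default 12-layer configuration):
+
+    ```python
+    from transformers import VitDetConfig, VitDetBackbone
+
+    config = VitDetConfig(out_features=["stage6", "stage12"])  # an intermediate and the final stage
+    model = VitDetBackbone(config)
+    ```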
+ """, + VITDET_START_DOCSTRING, +) +class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.embeddings = VitDetEmbeddings(config) + self.encoder = VitDetEncoder(config) + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + + # initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> VitDetEmbeddings: + return self.embeddings.projection + + @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import VitDetConfig, VitDetBackbone + >>> import torch + + >>> config = VitDetConfig() + >>> model = VitDetBackbone(config) + + >>> pixel_values = torch.randn(1, 3, 224, 224) + + >>> with torch.no_grad(): + ... outputs = model(pixel_values) + + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 14, 14] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, + output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["VitDetModel", "VitDetPreTrainedModel", "VitDetBackbone"] diff --git a/docs/transformers/build/lib/transformers/models/vitmatte/__init__.py b/docs/transformers/build/lib/transformers/models/vitmatte/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5b87cea448ab5296902d91d351d06252c52a1386 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitmatte/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
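+# Note: as in the other model subpackages, `_LazyModule` below defers importing the heavy
+# submodules until an attribute is first accessed, so e.g.
+# `from transformers.models.vitmatte import VitMatteImageProcessor` only imports
+# `image_processing_vitmatte` at that point.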
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vitmatte import * + from .image_processing_vitmatte import * + from .modeling_vitmatte import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/vitmatte/configuration_vitmatte.py b/docs/transformers/build/lib/transformers/models/vitmatte/configuration_vitmatte.py new file mode 100644 index 0000000000000000000000000000000000000000..b9f78043306b72e3951ecb16bb1bfcb868abac20 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitmatte/configuration_vitmatte.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VitMatte model configuration""" + +import copy +from typing import List + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto.configuration_auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class VitMatteConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of [`VitMatteForImageMatting`]. It is used to + instantiate a ViTMatte model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ViTMatte + [hustvl/vitmatte-small-composition-1k](https://huggingface.co/hustvl/vitmatte-small-composition-1k) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `VitDetConfig()`): + The configuration of the backbone model. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. 
+ hidden_size (`int`, *optional*, defaults to 384): + The number of input channels of the decoder. + batch_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the batch norm layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + convstream_hidden_sizes (`List[int]`, *optional*, defaults to `[48, 96, 192]`): + The output channels of the ConvStream module. + fusion_hidden_sizes (`List[int]`, *optional*, defaults to `[256, 128, 64, 32]`): + The output channels of the Fusion blocks. + + Example: + + ```python + >>> from transformers import VitMatteConfig, VitMatteForImageMatting + + >>> # Initializing a ViTMatte hustvl/vitmatte-small-composition-1k style configuration + >>> configuration = VitMatteConfig() + + >>> # Initializing a model (with random weights) from the hustvl/vitmatte-small-composition-1k style configuration + >>> model = VitMatteForImageMatting(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vitmatte" + + def __init__( + self, + backbone_config: PretrainedConfig = None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + hidden_size: int = 384, + batch_norm_eps: float = 1e-5, + initializer_range: float = 0.02, + convstream_hidden_sizes: List[int] = [48, 96, 192], + fusion_hidden_sizes: List[int] = [256, 128, 64, 32], + **kwargs, + ): + super().__init__(**kwargs) + + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `VitDet` backbone.") + backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.batch_norm_eps = batch_norm_eps + self.hidden_size = hidden_size + self.initializer_range = initializer_range + self.convstream_hidden_sizes = convstream_hidden_sizes + self.fusion_hidden_sizes = fusion_hidden_sizes + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. 
Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["backbone_config"] = self.backbone_config.to_dict() + output["model_type"] = self.__class__.model_type + return output + + +__all__ = ["VitMatteConfig"] diff --git a/docs/transformers/build/lib/transformers/models/vitmatte/convert_vitmatte_to_hf.py b/docs/transformers/build/lib/transformers/models/vitmatte/convert_vitmatte_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..5153e1faf525ea767e81e120a341acdfc60e5373 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitmatte/convert_vitmatte_to_hf.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert VitMatte checkpoints from the original repository. + +URL: https://github.com/hustvl/ViTMatte +""" + +import argparse + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import VitDetConfig, VitMatteConfig, VitMatteForImageMatting, VitMatteImageProcessor + + +def get_config(model_name): + hidden_size = 384 if "small" in model_name else 768 + num_attention_heads = 6 if "small" in model_name else 12 + + backbone_config = VitDetConfig( + num_channels=4, + image_size=512, + pretrain_image_size=224, + patch_size=16, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + use_absolute_position_embeddings=True, + use_relative_position_embeddings=True, + window_size=14, + # 2, 5, 8, 11 for global attention + window_block_indices=[0, 1, 3, 4, 6, 7, 9, 10], + residual_block_indices=[2, 5, 8, 11], + out_features=["stage12"], + ) + + return VitMatteConfig(backbone_config=backbone_config, hidden_size=hidden_size) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config): + rename_keys = [] + + # fmt: off + # stem + rename_keys.append(("backbone.pos_embed", "backbone.embeddings.position_embeddings")) + rename_keys.append(("backbone.patch_embed.proj.weight", "backbone.embeddings.projection.weight")) + rename_keys.append(("backbone.patch_embed.proj.bias", "backbone.embeddings.projection.bias")) + # fmt: on + + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def convert_vitmatte_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub): + config = get_config(model_name) + + # load original state dict + model_name_to_filename = { + "vitmatte-small-composition-1k": "ViTMatte_S_Com.pth", + "vitmatte-base-composition-1k": "ViTMatte_B_Com.pth", + "vitmatte-small-distinctions-646": "ViTMatte_S_DIS.pth", + "vitmatte-base-distinctions-646": "ViTMatte_B_DIS.pth", + } + + filename = model_name_to_filename[model_name] + filepath = hf_hub_download(repo_id="nielsr/vitmatte-checkpoints", filename=filename, repo_type="model") + state_dict = torch.load(filepath, map_location="cpu", 
weights_only=True) + + # rename keys + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + if "backbone.blocks" in key: + key = key.replace("backbone.blocks", "backbone.encoder.layer") + if "attn" in key: + key = key.replace("attn", "attention") + if "fusion_blks" in key: + key = key.replace("fusion_blks", "fusion_blocks") + if "bn" in key: + key = key.replace("bn", "batch_norm") + state_dict[key] = val + + # rename keys + rename_keys = create_rename_keys(config) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + + # create model + processor = VitMatteImageProcessor() + model = VitMatteForImageMatting(config) + model.eval() + + # load state dict + model.load_state_dict(state_dict) + + # verify on dummy image + trimap + url = "https://github.com/hustvl/ViTMatte/blob/main/demo/bulb_rgb.png?raw=true" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + url = "https://github.com/hustvl/ViTMatte/blob/main/demo/bulb_trimap.png?raw=true" + trimap = Image.open(requests.get(url, stream=True).raw) + + pixel_values = processor(images=image, trimaps=trimap.convert("L"), return_tensors="pt").pixel_values + + with torch.no_grad(): + alphas = model(pixel_values).alphas + + if model_name == "vitmatte-small-composition-1k": + expected_slice = torch.tensor([[0.9977, 0.9987, 0.9990], [0.9980, 0.9998, 0.9998], [0.9983, 0.9998, 0.9998]]) + elif model_name == "vitmatte-base-composition-1k": + expected_slice = torch.tensor([[0.9972, 0.9971, 0.9981], [0.9948, 0.9987, 0.9994], [0.9963, 0.9992, 0.9995]]) + elif model_name == "vitmatte-small-distinctions-646": + expected_slice = torch.tensor([[0.9880, 0.9970, 0.9972], [0.9960, 0.9996, 0.9997], [0.9963, 0.9996, 0.9997]]) + elif model_name == "vitmatte-base-distinctions-646": + expected_slice = torch.tensor([[0.9963, 0.9998, 0.9999], [0.9995, 1.0000, 1.0000], [0.9992, 0.9999, 1.0000]]) + + assert torch.allclose(alphas[0, 0, :3, :3], expected_slice, atol=1e-4) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print(f"Pushing model and processor for {model_name} to hub") + model.push_to_hub(f"hustvl/{model_name}") + processor.push_to_hub(f"hustvl/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="vitmatte-small-composition-1k", + type=str, + choices=[ + "vitmatte-small-composition-1k", + "vitmatte-base-composition-1k", + "vitmatte-small-distinctions-646", + "vitmatte-base-distinctions-646", + ], + help="Name of the VitMatte model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_vitmatte_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/docs/transformers/build/lib/transformers/models/vitmatte/image_processing_vitmatte.py b/docs/transformers/build/lib/transformers/models/vitmatte/image_processing_vitmatte.py new file mode 100644 index 0000000000000000000000000000000000000000..4c3b06e08815e2b8ebac1864da691900de660156 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitmatte/image_processing_vitmatte.py @@ -0,0 +1,272 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for ViTMatte.""" + +from typing import List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import pad, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, logging + + +logger = logging.get_logger(__name__) + + +class VitMatteImageProcessor(BaseImageProcessor): + r""" + Constructs a ViTMatte image processor. + + Args: + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to make the width and height divisible by `size_divisibility`. Can be overridden + by the `do_pad` parameter in the `preprocess` method. + size_divisibility (`int`, *optional*, defaults to 32): + The width and height of the image will be padded to be divisible by this number. 
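+
+    Example (an illustrative sketch; the checkpoint name and demo image URLs below are the ones used by the
+    conversion script in this file set):
+
+    ```python
+    >>> from transformers import VitMatteImageProcessor
+    >>> import requests
+    >>> from PIL import Image
+
+    >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
+
+    >>> url = "https://github.com/hustvl/ViTMatte/blob/main/demo/bulb_rgb.png?raw=true"
+    >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+    >>> url = "https://github.com/hustvl/ViTMatte/blob/main/demo/bulb_trimap.png?raw=true"
+    >>> trimap = Image.open(requests.get(url, stream=True).raw).convert("L")
+
+    >>> # the RGB image and the single-channel trimap are concatenated into 4-channel pixel values,
+    >>> # padded so that height and width are divisible by `size_divisibility`
+    >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")
+    >>> inputs.pixel_values.shape[1]
+    4
+    ```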
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: bool = True, + size_divisibility: int = 32, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.do_pad = do_pad + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.size_divisibility = size_divisibility + + def pad_image( + self, + image: np.ndarray, + size_divisibility: int = 32, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Args: + image (`np.ndarray`): + Image to pad. + size_divisibility (`int`, *optional*, defaults to 32): + The width and height of the image will be padded to be divisible by this number. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + height, width = get_image_size(image, input_data_format) + + pad_height = 0 if height % size_divisibility == 0 else size_divisibility - height % size_divisibility + pad_width = 0 if width % size_divisibility == 0 else size_divisibility - width % size_divisibility + if pad_width + pad_height > 0: + padding = ((0, pad_height), (0, pad_width)) + image = pad(image, padding=padding, data_format=data_format, input_data_format=input_data_format) + + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_data_format) + + return image + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + trimaps: ImageInput, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + size_divisibility: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. 
If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + trimaps (`ImageInput`): + Trimap to preprocess. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image. + size_divisibility (`int`, *optional*, defaults to `self.size_divisibility`): + The size divisibility to pad the image to if `do_pad` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + do_pad = do_pad if do_pad is not None else self.do_pad + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + size_divisibility = size_divisibility if size_divisibility is not None else self.size_divisibility + + images = make_list_of_images(images) + trimaps = make_list_of_images(trimaps, expected_ndims=2) + + if not valid_images(trimaps): + raise ValueError( + "Invalid trimap type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." 
+ ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_pad=do_pad, + size_divisibility=size_divisibility, + ) + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + trimaps = [to_numpy_array(trimap) for trimap in trimaps] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + trimaps = [ + self.rescale(image=trimap, scale=rescale_factor, input_data_format=input_data_format) + for trimap in trimaps + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + # concatenate images and trimaps + images = [ + np.concatenate([image, np.expand_dims(trimap, axis=-1)], axis=-1) for image, trimap in zip(images, trimaps) + ] + + if do_pad: + images = [ + self.pad_image(image, size_divisibility=size_divisibility, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image=image, channel_dim=data_format, input_channel_dim=input_data_format) + for image in images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["VitMatteImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/vitmatte/modeling_vitmatte.py b/docs/transformers/build/lib/transformers/models/vitmatte/modeling_vitmatte.py new file mode 100644 index 0000000000000000000000000000000000000000..aa3aa5b883b2483bb9070002fbc7aba3ee1ffe01 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitmatte/modeling_vitmatte.py @@ -0,0 +1,341 @@ +# coding=utf-8 +# Copyright 2023 HUST-VL and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ViTMatte model.""" + +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import nn + +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...utils.backbone_utils import load_backbone +from .configuration_vitmatte import VitMatteConfig + + +# General docstring +_CONFIG_FOR_DOC = "VitMatteConfig" + + +@dataclass +class ImageMattingOutput(ModelOutput): + """ + Class for outputs of image matting models. 
+ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Loss. + alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Estimated alpha values. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + alphas: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class VitMattePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = VitMatteConfig + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = [] + + def _init_weights(self, module): + if isinstance(module, nn.Conv2d): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + + +class VitMatteBasicConv3x3(nn.Module): + """ + Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers. + """ + + def __init__(self, config, in_channels, out_channels, stride=2, padding=1): + super().__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + padding=padding, + bias=False, + ) + self.batch_norm = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps) + self.relu = nn.ReLU() + + def forward(self, hidden_state): + hidden_state = self.conv(hidden_state) + hidden_state = self.batch_norm(hidden_state) + hidden_state = self.relu(hidden_state) + + return hidden_state + + +class VitMatteConvStream(nn.Module): + """ + Simple ConvStream containing a series of basic conv3x3 layers to extract detail features. + """ + + def __init__(self, config): + super().__init__() + + # We use a default in-case there isn't a backbone config set. This is for backwards compatibility and + # to enable loading HF backbone models. 
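+        # The fallback of 4 input channels corresponds to a 3-channel RGB image concatenated with a 1-channel
+        # trimap, which is what `VitMatteImageProcessor.preprocess` produces.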
+ in_channels = 4 + if config.backbone_config is not None: + in_channels = config.backbone_config.num_channels + + out_channels = config.convstream_hidden_sizes + + self.convs = nn.ModuleList() + self.conv_chans = [in_channels] + out_channels + + for i in range(len(self.conv_chans) - 1): + in_chan_ = self.conv_chans[i] + out_chan_ = self.conv_chans[i + 1] + self.convs.append(VitMatteBasicConv3x3(config, in_chan_, out_chan_)) + + def forward(self, pixel_values): + out_dict = {"detailed_feature_map_0": pixel_values} + embeddings = pixel_values + for i in range(len(self.convs)): + embeddings = self.convs[i](embeddings) + name_ = "detailed_feature_map_" + str(i + 1) + out_dict[name_] = embeddings + + return out_dict + + +class VitMatteFusionBlock(nn.Module): + """ + Simple fusion block to fuse features from ConvStream and Plain Vision Transformer. + """ + + def __init__(self, config, in_channels, out_channels): + super().__init__() + self.conv = VitMatteBasicConv3x3(config, in_channels, out_channels, stride=1, padding=1) + + def forward(self, features, detailed_feature_map): + upscaled_features = nn.functional.interpolate(features, scale_factor=2, mode="bilinear", align_corners=False) + out = torch.cat([detailed_feature_map, upscaled_features], dim=1) + out = self.conv(out) + + return out + + +class VitMatteHead(nn.Module): + """ + Simple Matting Head, containing only conv3x3 and conv1x1 layers. + """ + + def __init__(self, config): + super().__init__() + + in_channels = config.fusion_hidden_sizes[-1] + mid_channels = 16 + + self.matting_convs = nn.Sequential( + nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(mid_channels), + nn.ReLU(True), + nn.Conv2d(mid_channels, 1, kernel_size=1, stride=1, padding=0), + ) + + def forward(self, hidden_state): + hidden_state = self.matting_convs(hidden_state) + + return hidden_state + + +class VitMatteDetailCaptureModule(nn.Module): + """ + Simple and lightweight Detail Capture Module for ViT Matting. + """ + + def __init__(self, config): + super().__init__() + if len(config.fusion_hidden_sizes) != len(config.convstream_hidden_sizes) + 1: + raise ValueError( + "The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1." + ) + + self.config = config + self.convstream = VitMatteConvStream(config) + self.conv_chans = self.convstream.conv_chans + + self.fusion_blocks = nn.ModuleList() + self.fusion_channels = [config.hidden_size] + config.fusion_hidden_sizes + + for i in range(len(self.fusion_channels) - 1): + self.fusion_blocks.append( + VitMatteFusionBlock( + config=config, + in_channels=self.fusion_channels[i] + self.conv_chans[-(i + 1)], + out_channels=self.fusion_channels[i + 1], + ) + ) + + self.matting_head = VitMatteHead(config) + + def forward(self, features, pixel_values): + detail_features = self.convstream(pixel_values) + for i in range(len(self.fusion_blocks)): + detailed_feature_map_name = "detailed_feature_map_" + str(len(self.fusion_blocks) - i - 1) + features = self.fusion_blocks[i](features, detail_features[detailed_feature_map_name]) + + alphas = torch.sigmoid(self.matting_head(features)) + + return alphas + + +VITMATTE_START_DOCSTRING = r""" + Parameters: + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. 
+    config ([`VitMatteConfig`]): Model configuration class with all the parameters of the model.
+        Initializing with a config file does not load the weights associated with the model, only the
+        configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VITMATTE_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`VitMatteImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers in case the backbone has them. See
+            `attentions` under returned tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers of the backbone. See `hidden_states` under
+            returned tensors for more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    """ViTMatte framework leveraging any vision backbone (e.g. ViTDet) for image matting.""",
+    VITMATTE_START_DOCSTRING,
+)
+class VitMatteForImageMatting(VitMattePreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.backbone = load_backbone(config)
+        self.decoder = VitMatteDetailCaptureModule(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(VITMATTE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=ImageMattingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """
+        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth alpha mattes for computing the loss.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
+        >>> import torch
+        >>> from PIL import Image
+        >>> from huggingface_hub import hf_hub_download
+
+        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
+        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")
+
+        >>> filepath = hf_hub_download(
+        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
+        ... )
+        >>> image = Image.open(filepath).convert("RGB")
+        >>> filepath = hf_hub_download(
+        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
+        ... )
+        >>> trimap = Image.open(filepath).convert("L")
+
+        >>> # prepare image + trimap for the model
+        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...
alphas = model(**inputs).alphas + >>> print(alphas.shape) + torch.Size([1, 1, 640, 960]) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + loss = None + if labels is not None: + raise NotImplementedError("Training is not yet supported") + + outputs = self.backbone.forward_with_filtered_kwargs( + pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions + ) + + features = outputs.feature_maps[-1] + alphas = self.decoder(features, pixel_values) + + if not return_dict: + output = (alphas,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageMattingOutput( + loss=loss, + alphas=alphas, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["VitMattePreTrainedModel", "VitMatteForImageMatting"] diff --git a/docs/transformers/build/lib/transformers/models/vitpose/__init__.py b/docs/transformers/build/lib/transformers/models/vitpose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4a57524cce2143830e294c1920e395436d670add --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitpose/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vitpose import * + from .image_processing_vitpose import * + from .modeling_vitpose import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/vitpose/configuration_vitpose.py b/docs/transformers/build/lib/transformers/models/vitpose/configuration_vitpose.py new file mode 100644 index 0000000000000000000000000000000000000000..aba8fec7ae41a1ebd7caf27f582b3b1913cad06c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitpose/configuration_vitpose.py @@ -0,0 +1,126 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""VitPose model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import verify_backbone_config_arguments +from ..auto.configuration_auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class VitPoseConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VitPoseForPoseEstimation`]. It is used to instantiate a + VitPose model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the VitPose + [usyd-community/vitpose-base-simple](https://huggingface.co/usyd-community/vitpose-base-simple) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `VitPoseBackboneConfig()`): + The configuration of the backbone model. Currently, only `backbone_config` with `vitpose_backbone` as `model_type` is supported. + backbone (`str`, *optional*): + Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this + will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone` + is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights. + use_pretrained_backbone (`bool`, *optional*, defaults to `False`): + Whether to use pretrained weights for the backbone. + use_timm_backbone (`bool`, *optional*, defaults to `False`): + Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers + library. + backbone_kwargs (`dict`, *optional*): + Keyword arguments to be passed to AutoBackbone when loading from a checkpoint + e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + scale_factor (`int`, *optional*, defaults to 4): + Factor to upscale the feature maps coming from the ViT backbone. + use_simple_decoder (`bool`, *optional*, defaults to `True`): + Whether to use a `VitPoseSimpleDecoder` to decode the feature maps from the backbone into heatmaps. Otherwise it uses `VitPoseClassicDecoder`. + + + Example: + + ```python + >>> from transformers import VitPoseConfig, VitPoseForPoseEstimation + + >>> # Initializing a VitPose configuration + >>> configuration = VitPoseConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = VitPoseForPoseEstimation(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vitpose" + + def __init__( + self, + backbone_config: Optional[PretrainedConfig] = None, + backbone: Optional[str] = None, + use_pretrained_backbone: bool = False, + use_timm_backbone: bool = False, + backbone_kwargs: Optional[dict] = None, + initializer_range: float = 0.02, + scale_factor: int = 4, + use_simple_decoder: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + + if use_pretrained_backbone: + logger.info( + "`use_pretrained_backbone` is `True`. For the pure inference purpose of VitPose weight do not set this value." 
+ ) + if use_timm_backbone: + raise ValueError("use_timm_backbone set `True` is not supported at the moment.") + + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `VitPose` backbone.") + backbone_config = CONFIG_MAPPING["vitpose_backbone"](out_indices=[4]) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.get("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + verify_backbone_config_arguments( + use_timm_backbone=use_timm_backbone, + use_pretrained_backbone=use_pretrained_backbone, + backbone=backbone, + backbone_config=backbone_config, + backbone_kwargs=backbone_kwargs, + ) + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + + self.initializer_range = initializer_range + self.scale_factor = scale_factor + self.use_simple_decoder = use_simple_decoder + + +__all__ = ["VitPoseConfig"] diff --git a/docs/transformers/build/lib/transformers/models/vitpose/convert_vitpose_to_hf.py b/docs/transformers/build/lib/transformers/models/vitpose/convert_vitpose_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..e4666751a1078d487756f9681cd9fd21fbc58212 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitpose/convert_vitpose_to_hf.py @@ -0,0 +1,428 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert VitPose checkpoints from the original repository. + +URL: https://github.com/vitae-transformer/vitpose + +Notebook to get the original logits: https://colab.research.google.com/drive/1QDX_2POTpl6JaZAV2WIFjuiqDsDwiqMZ?usp=sharing. 
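+
+Example invocation (using the flags defined by this script's argument parser):
+
+    python convert_vitpose_to_hf.py --model_name vitpose-base-simple --pytorch_dump_folder_path ./vitpose-base-simple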
+""" + +import argparse +import os +import re + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import VitPoseBackboneConfig, VitPoseConfig, VitPoseForPoseEstimation, VitPoseImageProcessor + + +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + r"patch_embed.proj": "embeddings.patch_embeddings.projection", + r"pos_embed": "embeddings.position_embeddings", + r"blocks": "encoder.layer", + r"attn.proj": "attention.output.dense", + r"attn": "attention.self", + r"norm1": "layernorm_before", + r"norm2": "layernorm_after", + r"last_norm": "layernorm", + r"keypoint_head": "head", + r"final_layer": "conv", +} + +MODEL_TO_FILE_NAME_MAPPING = { + # VitPose models, simple decoder + "vitpose-base-simple": "vitpose-b-simple.pth", + # VitPose models, classic decoder + "vitpose-base": "vitpose-b.pth", + # VitPose models, COCO-AIC-MPII + "vitpose-base-coco-aic-mpii": "vitpose_base_coco_aic_mpii.pth", + # VitPose+ models + "vitpose-plus-small": "vitpose+_small.pth", + "vitpose-plus-base": "vitpose+_base.pth", + "vitpose-plus-large": "vitpose+_large.pth", + "vitpose-plus-huge": "vitpose+_huge.pth", +} + + +def get_config(model_name): + if "plus" in model_name: + num_experts = 6 + if "small" in model_name: + part_features = 96 + out_indices = [12] + elif "base" in model_name: + part_features = 192 + out_indices = [12] + elif "large" in model_name: + part_features = 256 + out_indices = [24] + elif "huge" in model_name: + part_features = 320 + out_indices = [32] + else: + raise ValueError(f"Model {model_name} not supported") + else: + num_experts = 1 + part_features = 0 + + # size of the architecture + if "small" in model_name: + hidden_size = 384 + num_hidden_layers = 12 + num_attention_heads = 12 + elif "large" in model_name: + hidden_size = 1024 + num_hidden_layers = 24 + num_attention_heads = 16 + elif "huge" in model_name: + hidden_size = 1280 + num_hidden_layers = 32 + num_attention_heads = 16 + + backbone_config = VitPoseBackboneConfig( + out_indices=out_indices, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + num_experts=num_experts, + part_features=part_features, + ) + + use_simple_decoder = "simple" in model_name + + edges = [ + [15, 13], + [13, 11], + [16, 14], + [14, 12], + [11, 12], + [5, 11], + [6, 12], + [5, 6], + [5, 7], + [6, 8], + [7, 9], + [8, 10], + [1, 2], + [0, 1], + [0, 2], + [1, 3], + [2, 4], + [3, 5], + [4, 6], + ] + id2label = { + 0: "Nose", + 1: "L_Eye", + 2: "R_Eye", + 3: "L_Ear", + 4: "R_Ear", + 5: "L_Shoulder", + 6: "R_Shoulder", + 7: "L_Elbow", + 8: "R_Elbow", + 9: "L_Wrist", + 10: "R_Wrist", + 11: "L_Hip", + 12: "R_Hip", + 13: "L_Knee", + 14: "R_Knee", + 15: "L_Ankle", + 16: "R_Ankle", + } + + label2id = {v: k for k, v in id2label.items()} + + config = VitPoseConfig( + backbone_config=backbone_config, + num_labels=17, + use_simple_decoder=use_simple_decoder, + edges=edges, + id2label=id2label, + label2id=label2id, + ) + + return config + + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + """ + This function should be applied only once, on the concatenated keys to efficiently rename using + the key mappings. 
+ """ + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # an empty line + continue + new_text = re.sub(pattern, replacement, new_text) + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + return output_dict + + +# We will verify our results on a COCO image +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000000139.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + +@torch.no_grad() +def write_model(model_name, model_path, push_to_hub, check_logits=True): + # ------------------------------------------------------------ + # Vision model params and config + # ------------------------------------------------------------ + + # params from config + config = get_config(model_name) + + # ------------------------------------------------------------ + # Convert weights + # ------------------------------------------------------------ + + # load original state_dict + filename = MODEL_TO_FILE_NAME_MAPPING[model_name] + print(f"Fetching all parameters from the checkpoint at {filename}...") + + checkpoint_path = hf_hub_download( + repo_id="nielsr/vitpose-original-checkpoints", filename=filename, repo_type="model" + ) + + print("Converting model...") + original_state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)["state_dict"] + all_keys = list(original_state_dict.keys()) + new_keys = convert_old_keys_to_new_keys(all_keys) + + dim = config.backbone_config.hidden_size + + state_dict = {} + for key in all_keys: + new_key = new_keys[key] + value = original_state_dict[key] + + if re.search("associate_heads", new_key) or re.search("backbone.cls_token", new_key): + # This associated_heads is concept of auxiliary head so does not require in inference stage. 
+            # backbone.cls_token is only used by an optional forward path for dynamically changing the input size;
+            # see https://github.com/ViTAE-Transformer/ViTPose/issues/34 for details.
+            pass
+        elif re.search("qkv", new_key):
+            state_dict[new_key.replace("self.qkv", "attention.query")] = value[:dim]
+            state_dict[new_key.replace("self.qkv", "attention.key")] = value[dim : dim * 2]
+            state_dict[new_key.replace("self.qkv", "attention.value")] = value[-dim:]
+        elif re.search("head", new_key) and not config.use_simple_decoder:
+            # Pattern for deconvolution layers
+            deconv_pattern = r"deconv_layers\.(0|3)\.weight"
+            new_key = re.sub(deconv_pattern, lambda m: f"deconv{int(m.group(1)) // 3 + 1}.weight", new_key)
+            # Pattern for batch normalization layers
+            bn_patterns = [
+                (r"deconv_layers\.(\d+)\.weight", r"batchnorm\1.weight"),
+                (r"deconv_layers\.(\d+)\.bias", r"batchnorm\1.bias"),
+                (r"deconv_layers\.(\d+)\.running_mean", r"batchnorm\1.running_mean"),
+                (r"deconv_layers\.(\d+)\.running_var", r"batchnorm\1.running_var"),
+                (r"deconv_layers\.(\d+)\.num_batches_tracked", r"batchnorm\1.num_batches_tracked"),
+            ]
+
+            for pattern, replacement in bn_patterns:
+                if re.search(pattern, new_key):
+                    # Convert the layer number to the correct batch norm index
+                    layer_num = int(re.search(pattern, key).group(1))
+                    bn_num = layer_num // 3 + 1
+                    new_key = re.sub(pattern, replacement.replace(r"\1", str(bn_num)), new_key)
+            state_dict[new_key] = value
+        else:
+            state_dict[new_key] = value
+
+    print("Loading the checkpoint in a VitPose model.")
+    model = VitPoseForPoseEstimation(config)
+    model.eval()
+    model.load_state_dict(state_dict)
+    print("Checkpoint loaded successfully.")
+
+    # create image processor
+    image_processor = VitPoseImageProcessor()
+
+    # verify image processor
+    image = prepare_img()
+    boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
+    pixel_values = image_processor(images=image, boxes=boxes, return_tensors="pt").pixel_values
+
+    filepath = hf_hub_download(repo_id="nielsr/test-image", filename="vitpose_batch_data.pt", repo_type="dataset")
+    original_pixel_values = torch.load(filepath, map_location="cpu", weights_only=True)["img"]
+    # we allow for a small difference in the pixel values due to the original repository using cv2
+    assert torch.allclose(pixel_values, original_pixel_values, atol=1e-1)
+
+    dataset_index = torch.tensor([0])
+
+    with torch.no_grad():
+        print("Shape of original_pixel_values: ", original_pixel_values.shape)
+        print("First values of original_pixel_values: ", original_pixel_values[0, 0, :3, :3])
+
+        # first forward pass
+        outputs = model(original_pixel_values, dataset_index=dataset_index)
+        output_heatmap = outputs.heatmaps
+
+        print("Shape of output_heatmap: ", output_heatmap.shape)
+        print("First values: ", output_heatmap[0, 0, :3, :3])
+
+        # second forward pass (flipped)
+        # this is done since the model uses `flip_test=True` in its test config
+        original_pixel_values_flipped = torch.flip(original_pixel_values, [3])
+        outputs_flipped = model(
+            original_pixel_values_flipped,
+            dataset_index=dataset_index,
+            flip_pairs=torch.tensor([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], [15, 16]]),
+        )
+        output_flipped_heatmap = outputs_flipped.heatmaps
+
+        outputs.heatmaps = (output_heatmap + output_flipped_heatmap) * 0.5
+
+    # Verify pose_results
+    pose_results = image_processor.post_process_pose_estimation(outputs, boxes=boxes)[0]
+
+    if check_logits:
+        # Simple decoder checkpoints
+        if model_name == "vitpose-base-simple":
+            assert torch.allclose(
+
pose_results[1]["keypoints"][0], + torch.tensor([3.98180511e02, 1.81808380e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.66642594e-01]), + atol=5e-2, + ) + # Classic decoder checkpoints + elif model_name == "vitpose-base": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([3.9807913e02, 1.8182812e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.8235235e-01]), + atol=5e-2, + ) + # COCO-AIC-MPII checkpoints + elif model_name == "vitpose-base-coco-aic-mpii": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([3.98305542e02, 1.81741592e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.69966745e-01]), + atol=5e-2, + ) + # VitPose+ models + elif model_name == "vitpose-plus-small": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([398.1597, 181.6902]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor(0.9051), + atol=5e-2, + ) + elif model_name == "vitpose-plus-base": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([3.98201294e02, 1.81728302e02]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor([8.75046968e-01]), + atol=5e-2, + ) + elif model_name == "vitpose-plus-large": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([398.1409, 181.7412]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor(0.8746), + atol=5e-2, + ) + elif model_name == "vitpose-plus-huge": + assert torch.allclose( + pose_results[1]["keypoints"][0], + torch.tensor([398.2079, 181.8026]), + atol=5e-2, + ) + assert torch.allclose( + pose_results[1]["scores"][0], + torch.tensor(0.8693), + atol=5e-2, + ) + else: + raise ValueError("Model not supported") + print("Conversion successfully done.") + + if model_path is not None: + os.makedirs(model_path, exist_ok=True) + model.save_pretrained(model_path) + image_processor.save_pretrained(model_path) + + if push_to_hub: + print(f"Pushing model and image processor for {model_name} to hub") + # we created a community organization on the hub for this model + # maintained by the Transformers team + model.push_to_hub(f"usyd-community/{model_name}") + image_processor.push_to_hub(f"usyd-community/{model_name}") + + +def main(): + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="vitpose-base-simple", + choices=MODEL_TO_FILE_NAME_MAPPING.keys(), + type=str, + help="Name of the VitPose model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to store the converted model." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + parser.add_argument( + "--check_logits", action="store_false", help="Whether or not to verify the logits of the converted model." 
+ ) + + args = parser.parse_args() + write_model( + model_path=args.pytorch_dump_folder_path, + model_name=args.model_name, + push_to_hub=args.push_to_hub, + check_logits=args.check_logits, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/transformers/build/lib/transformers/models/vitpose/image_processing_vitpose.py b/docs/transformers/build/lib/transformers/models/vitpose/image_processing_vitpose.py new file mode 100644 index 0000000000000000000000000000000000000000..387b7225473b61c1b505c3af617a4cd419887eb9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitpose/image_processing_vitpose.py @@ -0,0 +1,684 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for VitPose.""" + +import itertools +import math +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import to_channel_dimension_format +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + ChannelDimension, + ImageInput, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ...utils import TensorType, is_scipy_available, is_torch_available, is_vision_available, logging + + +if is_torch_available(): + import torch + +if is_vision_available(): + import PIL + +if is_scipy_available(): + from scipy.linalg import inv + from scipy.ndimage import affine_transform, gaussian_filter + +if TYPE_CHECKING: + from .modeling_vitpose import VitPoseEstimatorOutput + +logger = logging.get_logger(__name__) + + +# inspired by https://github.com/ViTAE-Transformer/ViTPose/blob/d5216452796c90c6bc29f5c5ec0bdba94366768a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_img_top_down_dataset.py#L132 +def box_to_center_and_scale( + box: Union[Tuple, List, np.ndarray], + image_width: int, + image_height: int, + normalize_factor: float = 200.0, + padding_factor: float = 1.25, +): + """ + Encodes a bounding box in COCO format into (center, scale). + + Args: + box (`Tuple`, `List`, or `np.ndarray`): + Bounding box in COCO format (top_left_x, top_left_y, width, height). + image_width (`int`): + Image width. + image_height (`int`): + Image height. + normalize_factor (`float`): + Width and height scale factor. + padding_factor (`float`): + Bounding box padding factor. + + Returns: + tuple: A tuple containing center and scale. + + - `np.ndarray` [float32](2,): Center of the bbox (x, y). + - `np.ndarray` [float32](2,): Scale of the bbox width & height. 
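+
+    Example (illustrative; the box below already matches the image aspect ratio, so only the `padding_factor`
+    rescales it):
+
+    ```python
+    >>> import numpy as np
+    >>> center, scale = box_to_center_and_scale((0, 0, 100, 200), image_width=100, image_height=200)
+    >>> center
+    array([ 50., 100.], dtype=float32)
+    >>> scale
+    array([0.625, 1.25 ], dtype=float32)
+    ```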
+ """ + + top_left_x, top_left_y, width, height = box[:4] + aspect_ratio = image_width / image_height + center = np.array([top_left_x + width * 0.5, top_left_y + height * 0.5], dtype=np.float32) + + if width > aspect_ratio * height: + height = width * 1.0 / aspect_ratio + elif width < aspect_ratio * height: + width = height * aspect_ratio + + scale = np.array([width / normalize_factor, height / normalize_factor], dtype=np.float32) + scale = scale * padding_factor + + return center, scale + + +def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray: + """ + Converts bounding boxes from the COCO format to the Pascal VOC format. + + In other words, converts from (top_left_x, top_left_y, width, height) format + to (top_left_x, top_left_y, bottom_right_x, bottom_right_y). + + Args: + bboxes (`np.ndarray` of shape `(batch_size, 4)): + Bounding boxes in COCO format. + + Returns: + `np.ndarray` of shape `(batch_size, 4) in Pascal VOC format. + """ + bboxes[:, 2] = bboxes[:, 2] + bboxes[:, 0] - 1 + bboxes[:, 3] = bboxes[:, 3] + bboxes[:, 1] - 1 + + return bboxes + + +def get_keypoint_predictions(heatmaps: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """Get keypoint predictions from score maps. + + Args: + heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`): + Model predicted heatmaps. + + Returns: + tuple: A tuple containing aggregated results. + + - coords (`np.ndarray` of shape `(batch_size, num_keypoints, 2)`): + Predicted keypoint location. + - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`): + Scores (confidence) of the keypoints. + """ + if not isinstance(heatmaps, np.ndarray): + raise ValueError("Heatmaps should be np.ndarray") + if heatmaps.ndim != 4: + raise ValueError("Heatmaps should be 4-dimensional") + + batch_size, num_keypoints, _, width = heatmaps.shape + heatmaps_reshaped = heatmaps.reshape((batch_size, num_keypoints, -1)) + idx = np.argmax(heatmaps_reshaped, 2).reshape((batch_size, num_keypoints, 1)) + scores = np.amax(heatmaps_reshaped, 2).reshape((batch_size, num_keypoints, 1)) + + preds = np.tile(idx, (1, 1, 2)).astype(np.float32) + preds[:, :, 0] = preds[:, :, 0] % width + preds[:, :, 1] = preds[:, :, 1] // width + + preds = np.where(np.tile(scores, (1, 1, 2)) > 0.0, preds, -1) + return preds, scores + + +def post_dark_unbiased_data_processing(coords: np.ndarray, batch_heatmaps: np.ndarray, kernel: int = 3) -> np.ndarray: + """DARK post-pocessing. Implemented by unbiased_data_processing. + + Paper references: + - Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020). + - Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020). + + Args: + coords (`np.ndarray` of shape `(num_persons, num_keypoints, 2)`): + Initial coordinates of human pose. + batch_heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`): + Batched heatmaps as predicted by the model. + A batch_size of 1 is used for the bottom up paradigm where all persons share the same heatmap. + A batch_size of `num_persons` is used for the top down paradigm where each person has its own heatmaps. + kernel (`int`, *optional*, defaults to 3): + Gaussian kernel size (K) for modulation. + + Returns: + `np.ndarray` of shape `(num_persons, num_keypoints, 2)` ): + Refined coordinates. 
+ """ + batch_size, num_keypoints, height, width = batch_heatmaps.shape + num_coords = coords.shape[0] + if not (batch_size == 1 or batch_size == num_coords): + raise ValueError("The batch size of heatmaps should be 1 or equal to the batch size of coordinates.") + radius = int((kernel - 1) // 2) + batch_heatmaps = np.array( + [ + [gaussian_filter(heatmap, sigma=0.8, radius=(radius, radius), axes=(0, 1)) for heatmap in heatmaps] + for heatmaps in batch_heatmaps + ] + ) + batch_heatmaps = np.clip(batch_heatmaps, 0.001, 50) + batch_heatmaps = np.log(batch_heatmaps) + + batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode="edge").flatten() + + # calculate indices for coordinates + index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (width + 2) + index += (width + 2) * (height + 2) * np.arange(0, batch_size * num_keypoints).reshape(-1, num_keypoints) + index = index.astype(int).reshape(-1, 1) + i_ = batch_heatmaps_pad[index] + ix1 = batch_heatmaps_pad[index + 1] + iy1 = batch_heatmaps_pad[index + width + 2] + ix1y1 = batch_heatmaps_pad[index + width + 3] + ix1_y1_ = batch_heatmaps_pad[index - width - 3] + ix1_ = batch_heatmaps_pad[index - 1] + iy1_ = batch_heatmaps_pad[index - 2 - width] + + # calculate refined coordinates using Newton's method + dx = 0.5 * (ix1 - ix1_) + dy = 0.5 * (iy1 - iy1_) + derivative = np.concatenate([dx, dy], axis=1) + derivative = derivative.reshape(num_coords, num_keypoints, 2, 1) + dxx = ix1 - 2 * i_ + ix1_ + dyy = iy1 - 2 * i_ + iy1_ + dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) + hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) + hessian = hessian.reshape(num_coords, num_keypoints, 2, 2) + hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) + coords -= np.einsum("ijmn,ijnk->ijmk", hessian, derivative).squeeze() + return coords + + +def transform_preds(coords: np.ndarray, center: np.ndarray, scale: np.ndarray, output_size: np.ndarray) -> np.ndarray: + """Get final keypoint predictions from heatmaps and apply scaling and + translation to map them back to the image. + + Note: + num_keypoints: K + + Args: + coords (`np.ndarray` of shape `(num_keypoints, ndims)`): + + * If ndims=2, corrds are predicted keypoint location. + * If ndims=4, corrds are composed of (x, y, scores, tags) + * If ndims=5, corrds are composed of (x, y, scores, tags, + flipped_tags) + + center (`np.ndarray` of shape `(2,)`): + Center of the bounding box (x, y). + scale (`np.ndarray` of shape `(2,)`): + Scale of the bounding box wrt original image of width and height. + output_size (`np.ndarray` of shape `(2,)`): + Size of the destination heatmaps in (height, width) format. + + Returns: + np.ndarray: Predicted coordinates in the images. + """ + if coords.shape[1] not in (2, 4, 5): + raise ValueError("Coordinates need to have either 2, 4 or 5 dimensions.") + if len(center) != 2: + raise ValueError("Center needs to have 2 elements, one for x and one for y.") + if len(scale) != 2: + raise ValueError("Scale needs to consist of a width and height") + if len(output_size) != 2: + raise ValueError("Output size needs to consist of a height and width") + + # Recover the scale which is normalized by a factor of 200. 
+    scale = scale * 200.0
+
+    # We use unbiased data processing
+    scale_y = scale[1] / (output_size[0] - 1.0)
+    scale_x = scale[0] / (output_size[1] - 1.0)
+
+    target_coords = np.ones_like(coords)
+    target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5
+    target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5
+
+    return target_coords
+
+
+def get_warp_matrix(theta: float, size_input: np.ndarray, size_dst: np.ndarray, size_target: np.ndarray):
+    """
+    Calculate the transformation matrix under the constraint of unbiased data processing. Paper ref: Huang et al. The
+    Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
+
+    Source: https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py
+
+    Args:
+        theta (`float`):
+            Rotation angle in degrees.
+        size_input (`np.ndarray`):
+            Size of input image [width, height].
+        size_dst (`np.ndarray`):
+            Size of output image [width, height].
+        size_target (`np.ndarray`):
+            Size of ROI in input plane [w, h].
+
+    Returns:
+        `np.ndarray`: A matrix for transformation.
+    """
+    theta = np.deg2rad(theta)
+    matrix = np.zeros((2, 3), dtype=np.float32)
+    scale_x = size_dst[0] / size_target[0]
+    scale_y = size_dst[1] / size_target[1]
+    matrix[0, 0] = math.cos(theta) * scale_x
+    matrix[0, 1] = -math.sin(theta) * scale_x
+    matrix[0, 2] = scale_x * (
+        -0.5 * size_input[0] * math.cos(theta) + 0.5 * size_input[1] * math.sin(theta) + 0.5 * size_target[0]
+    )
+    matrix[1, 0] = math.sin(theta) * scale_y
+    matrix[1, 1] = math.cos(theta) * scale_y
+    matrix[1, 2] = scale_y * (
+        -0.5 * size_input[0] * math.sin(theta) - 0.5 * size_input[1] * math.cos(theta) + 0.5 * size_target[1]
+    )
+    return matrix
+
+
+def scipy_warp_affine(src, M, size):
+    """
+    This function implements the cv2.warpAffine function using scipy's affine_transform. See
+    https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.affine_transform.html and
+    https://docs.opencv.org/4.x/d4/d61/tutorial_warp_affine.html for more details.
+
+    Note: the original implementation of cv2.warpAffine uses cv2.INTER_LINEAR.
+    """
+    channels = [src[..., i] for i in range(src.shape[-1])]
+
+    # Convert to a 3x3 matrix used by SciPy
+    M_scipy = np.vstack([M, [0, 0, 1]])
+    # If you have a matrix for the 'push' transformation, use its inverse (numpy.linalg.inv) in this function.
+    M_inv = inv(M_scipy)
+    M_inv[0, 0], M_inv[0, 1], M_inv[1, 0], M_inv[1, 1], M_inv[0, 2], M_inv[1, 2] = (
+        M_inv[1, 1],
+        M_inv[1, 0],
+        M_inv[0, 1],
+        M_inv[0, 0],
+        M_inv[1, 2],
+        M_inv[0, 2],
+    )
+
+    new_src = [affine_transform(channel, M_inv, output_shape=size, order=1) for channel in channels]
+    new_src = np.stack(new_src, axis=-1)
+    return new_src
+
+
+class VitPoseImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a VitPose image processor.
+
+    Args:
+        do_affine_transform (`bool`, *optional*, defaults to `True`):
+            Whether to apply an affine transformation to the input images.
+        size (`Dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 192}`):
+            Resolution of the image after `affine_transform` is applied. Only has an effect if `do_affine_transform`
+            is set to `True`. Can be overridden by `size` in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image.
+            Can be overridden by `rescale_factor` in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input with mean and standard deviation.
+        image_mean (`List[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
+            The sequence of means for each channel, to be used when normalizing images.
+        image_std (`List[float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_affine_transform: bool = True,
+        size: Dict[str, int] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.do_affine_transform = do_affine_transform
+        self.size = size if size is not None else {"height": 256, "width": 192}
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+        self.normalize_factor = 200.0
+
+    def affine_transform(
+        self,
+        image: np.array,
+        center: Tuple[float],
+        scale: Tuple[float],
+        rotation: float,
+        size: Dict[str, int],
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.array:
+        """
+        Apply an affine transformation to an image.
+
+        Args:
+            image (`np.array`):
+                Image to transform.
+            center (`Tuple[float]`):
+                Center of the bounding box (x, y).
+            scale (`Tuple[float]`):
+                Scale of the bounding box with respect to height/width.
+            rotation (`float`):
+                Rotation angle in degrees.
+            size (`Dict[str, int]`):
+                Size of the destination image.
+            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format of the output image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image.
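+
+        Example (a hedged sketch, not from the original docstring; the image and box values are arbitrary):
+
+        ```python
+        >>> import numpy as np
+        >>> processor = VitPoseImageProcessor()
+        >>> image = np.zeros((480, 640, 3), dtype=np.float32)  # channels-last dummy image
+        >>> center = np.array([320.0, 240.0])  # box center (x, y)
+        >>> scale = np.array([2.0, 2.4])  # box size, in units of 200 pixels
+        >>> out = processor.affine_transform(
+        ...     image, center, scale, rotation=0, size={"height": 256, "width": 192}, input_data_format="channels_last"
+        ... )
+        >>> out.shape
+        (256, 192, 3)
+        ```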
+ """ + + data_format = input_data_format if data_format is None else data_format + + size = (size["width"], size["height"]) + + # one uses a pixel standard deviation of 200 pixels + transformation = get_warp_matrix(rotation, center * 2.0, np.array(size) - 1.0, scale * 200.0) + + # input image requires channels last format + image = ( + image + if input_data_format == ChannelDimension.LAST + else to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) + ) + image = scipy_warp_affine(src=image, M=transformation, size=(size[1], size[0])) + + image = to_channel_dimension_format(image, data_format, ChannelDimension.LAST) + + return image + + def preprocess( + self, + images: ImageInput, + boxes: Union[List[List[float]], np.ndarray], + do_affine_transform: Optional[bool] = None, + size: Dict[str, int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + + boxes (`List[List[List[float]]]` or `np.ndarray`): + List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding + box coordinates in COCO format (top_left_x, top_left_y, width, height). + + do_affine_transform (`bool`, *optional*, defaults to `self.do_affine_transform`): + Whether to apply an affine transformation to the input images. + size (`Dict[str, int]` *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height, + width). 
+ """ + do_affine_transform = do_affine_transform if do_affine_transform is not None else self.do_affine_transform + size = size if size is not None else self.size + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if isinstance(boxes, list) and len(images) != len(boxes): + raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {len(boxes)}") + elif isinstance(boxes, np.ndarray) and len(images) != boxes.shape[0]: + raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {boxes.shape[0]}") + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + # transformations (affine transformation + rescaling + normalization) + if self.do_affine_transform: + new_images = [] + for image, image_boxes in zip(images, boxes): + for box in image_boxes: + center, scale = box_to_center_and_scale( + box, + image_width=size["width"], + image_height=size["height"], + normalize_factor=self.normalize_factor, + ) + transformed_image = self.affine_transform( + image, center, scale, rotation=0, size=size, input_data_format=input_data_format + ) + new_images.append(transformed_image) + images = new_images + + # For batch processing, the number of boxes must be consistent across all images in the batch. + # When using a list input, the number of boxes can vary dynamically per image. + # The image processor creates pixel_values of shape (batch_size*num_persons, num_channels, height, width) + + all_images = [] + for image in images: + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs + + def keypoints_from_heatmaps( + self, + heatmaps: np.ndarray, + center: np.ndarray, + scale: np.ndarray, + kernel: int = 11, + ): + """ + Get final keypoint predictions from heatmaps and transform them back to + the image. + + Args: + heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width])`): + Model predicted heatmaps. + center (`np.ndarray` of shape `(batch_size, 2)`): + Center of the bounding box (x, y). 
+            scale (`np.ndarray` of shape `(batch_size, 2)`):
+                Scale of the bounding box with respect to the original image width and height.
+            kernel (`int`, *optional*, defaults to 11):
+                Gaussian kernel size (K) for modulation, which should match the heatmap gaussian sigma when training.
+                K=17 for sigma=3 and K=11 for sigma=2.
+
+        Returns:
+            tuple: A tuple containing keypoint predictions and scores.
+
+            - preds (`np.ndarray` of shape `(batch_size, num_keypoints, 2)`):
+                Predicted keypoint location in images.
+            - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`):
+                Scores (confidence) of the keypoints.
+        """
+        batch_size, _, height, width = heatmaps.shape
+
+        coords, scores = get_keypoint_predictions(heatmaps)
+
+        preds = post_dark_unbiased_data_processing(coords, heatmaps, kernel=kernel)
+
+        # Transform back to the image
+        for i in range(batch_size):
+            preds[i] = transform_preds(preds[i], center=center[i], scale=scale[i], output_size=[height, width])
+
+        return preds, scores
+
+    def post_process_pose_estimation(
+        self,
+        outputs: "VitPoseEstimatorOutput",
+        boxes: Union[List[List[List[float]]], np.ndarray],
+        kernel_size: int = 11,
+        threshold: Optional[float] = None,
+        target_sizes: Union[TensorType, List[Tuple]] = None,
+    ):
+        """
+        Transform the heatmaps into keypoint predictions and transform them back to the image.
+
+        Args:
+            outputs (`VitPoseEstimatorOutput`):
+                VitPoseForPoseEstimation model outputs.
+            boxes (`List[List[List[float]]]` or `np.ndarray`):
+                List or array of bounding boxes for each image. Each box should be a list of 4 floats representing
+                the bounding box coordinates in COCO format (top_left_x, top_left_y, width, height).
+            kernel_size (`int`, *optional*, defaults to 11):
+                Gaussian kernel size (K) for modulation.
+            threshold (`float`, *optional*, defaults to `None`):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will be resized with the default
+                value.
+
+        Returns:
+            `List[List[Dict]]`: A list of dictionaries, each dictionary containing the keypoints and boxes for an
+            image in the batch as predicted by the model.
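+
+        Example (a short sketch; `processor`, `outputs` and `boxes` are assumed to come from the
+        `VitPoseForPoseEstimation` usage shown in its forward docstring):
+
+        ```python
+        >>> results = processor.post_process_pose_estimation(outputs, boxes=boxes)
+        >>> keypoints = results[0][0]["keypoints"]  # first image, first detected person
+        >>> scores = results[0][0]["scores"]
+        ```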
+ """ + + # First compute centers and scales for each bounding box + batch_size, num_keypoints, _, _ = outputs.heatmaps.shape + + if target_sizes is not None: + if batch_size != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + centers = np.zeros((batch_size, 2), dtype=np.float32) + scales = np.zeros((batch_size, 2), dtype=np.float32) + flattened_boxes = list(itertools.chain(*boxes)) + for i in range(batch_size): + if target_sizes is not None: + image_width, image_height = target_sizes[i][0], target_sizes[i][1] + scale_factor = np.array([image_width, image_height, image_width, image_height]) + flattened_boxes[i] = flattened_boxes[i] * scale_factor + width, height = self.size["width"], self.size["height"] + center, scale = box_to_center_and_scale(flattened_boxes[i], image_width=width, image_height=height) + centers[i, :] = center + scales[i, :] = scale + + preds, scores = self.keypoints_from_heatmaps( + outputs.heatmaps.cpu().numpy(), centers, scales, kernel=kernel_size + ) + + all_boxes = np.zeros((batch_size, 4), dtype=np.float32) + all_boxes[:, 0:2] = centers[:, 0:2] + all_boxes[:, 2:4] = scales[:, 0:2] + + poses = torch.tensor(preds) + scores = torch.tensor(scores) + labels = torch.arange(0, num_keypoints) + bboxes_xyxy = torch.tensor(coco_to_pascal_voc(all_boxes)) + + results: List[List[Dict[str, torch.Tensor]]] = [] + + pose_bbox_pairs = zip(poses, scores, bboxes_xyxy) + + for image_bboxes in boxes: + image_results: List[Dict[str, torch.Tensor]] = [] + for _ in image_bboxes: + # Unpack the next pose and bbox_xyxy from the iterator + pose, score, bbox_xyxy = next(pose_bbox_pairs) + score = score.squeeze() + keypoints_labels = labels + if threshold is not None: + keep = score > threshold + pose = pose[keep] + score = score[keep] + keypoints_labels = keypoints_labels[keep] + pose_result = {"keypoints": pose, "scores": score, "labels": keypoints_labels, "bbox": bbox_xyxy} + image_results.append(pose_result) + results.append(image_results) + + return results + + +__all__ = ["VitPoseImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/vitpose/modeling_vitpose.py b/docs/transformers/build/lib/transformers/models/vitpose/modeling_vitpose.py new file mode 100644 index 0000000000000000000000000000000000000000..dfe9738abf572d8d6224fb2232059794e4f4a6a1 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitpose/modeling_vitpose.py @@ -0,0 +1,340 @@ +# coding=utf-8 +# Copyright 2024 University of Sydney and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch VitPose model.""" + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.backbone_utils import load_backbone +from .configuration_vitpose import VitPoseConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "VitPoseConfig" + + +@dataclass +class VitPoseEstimatorOutput(ModelOutput): + """ + Class for outputs of pose estimation models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail. + heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`): + Heatmaps as predicted by the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + heatmaps: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +class VitPosePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = VitPoseConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +VITPOSE_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`VitPoseConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VITPOSE_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`VitPoseImageProcessor`]. See
+            [`VitPoseImageProcessor.__call__`] for details.
+
+        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
+            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.
+
+            This corresponds to the dataset index used during training. For a single dataset, index 0 refers to that
+            dataset; when training on multiple datasets, index 0 refers to dataset A (e.g. MPII) and index 1 refers
+            to dataset B (e.g. CrowdPose).
+
+        flip_pairs (`torch.Tensor`, *optional*):
+            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def flip_back(output_flipped, flip_pairs, target_type="gaussian-heatmap"):
+    """Flip the flipped heatmaps back to the original form.
+
+    Args:
+        output_flipped (`torch.Tensor` of shape `(batch_size, num_keypoints, height, width)`):
+            The output heatmaps obtained from the flipped images.
+        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
+            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
+        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
+            Target type to use. Can be gaussian-heatmap or combined-target.
+            gaussian-heatmap: Classification target with gaussian distribution.
+            combined-target: The combination of classification target (response map) and regression target (offset
+            map). Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for
+            Human Pose Estimation (CVPR 2020).
+
+    Returns:
+        torch.Tensor: heatmaps flipped back to the original form.
+    """
+    if target_type not in ["gaussian-heatmap", "combined-target"]:
+        raise ValueError("target_type should be gaussian-heatmap or combined-target")
+
+    if output_flipped.ndim != 4:
+        raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]")
+    batch_size, num_keypoints, height, width = output_flipped.shape
+    channels = 1
+    if target_type == "combined-target":
+        channels = 3
+        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
+    output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width)
+    output_flipped_back = output_flipped.clone()
+
+    # Swap left-right parts
+    for left, right in flip_pairs.tolist():
+        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
+        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
+    output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width))
+    # Flip horizontally
+    output_flipped_back = output_flipped_back.flip(-1)
+    return output_flipped_back
+
+
+class VitPoseSimpleDecoder(nn.Module):
+    """
+    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
+    feature maps into heatmaps.
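+
+    For example (illustrative shapes, assuming the default 256x192 input, a ViT-Base backbone with hidden size 768
+    and patch size 16, `scale_factor=4` and 17 keypoints): a `(batch_size, 768, 16, 12)` feature map is upsampled to
+    `(batch_size, 768, 64, 48)` and projected to `(batch_size, 17, 64, 48)` heatmaps.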
+ """ + + def __init__(self, config) -> None: + super().__init__() + + self.activation = nn.ReLU() + self.upsampling = nn.Upsample(scale_factor=config.scale_factor, mode="bilinear", align_corners=False) + self.conv = nn.Conv2d( + config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None) -> torch.Tensor: + # Transform input: ReLU + upsample + hidden_state = self.activation(hidden_state) + hidden_state = self.upsampling(hidden_state) + heatmaps = self.conv(hidden_state) + + if flip_pairs is not None: + heatmaps = flip_back(heatmaps, flip_pairs) + + return heatmaps + + +class VitPoseClassicDecoder(nn.Module): + """ + Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer, + turning the feature maps into heatmaps. + """ + + def __init__(self, config: VitPoseConfig): + super().__init__() + + self.deconv1 = nn.ConvTranspose2d( + config.backbone_config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False + ) + self.batchnorm1 = nn.BatchNorm2d(256) + self.relu1 = nn.ReLU() + + self.deconv2 = nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1, bias=False) + self.batchnorm2 = nn.BatchNorm2d(256) + self.relu2 = nn.ReLU() + + self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0) + + def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None): + hidden_state = self.deconv1(hidden_state) + hidden_state = self.batchnorm1(hidden_state) + hidden_state = self.relu1(hidden_state) + + hidden_state = self.deconv2(hidden_state) + hidden_state = self.batchnorm2(hidden_state) + hidden_state = self.relu2(hidden_state) + + heatmaps = self.conv(hidden_state) + + if flip_pairs is not None: + heatmaps = flip_back(heatmaps, flip_pairs) + + return heatmaps + + +@add_start_docstrings( + "The VitPose model with a pose estimation head on top.", + VITPOSE_START_DOCSTRING, +) +class VitPoseForPoseEstimation(VitPosePreTrainedModel): + def __init__(self, config: VitPoseConfig) -> None: + super().__init__(config) + + self.backbone = load_backbone(config) + + # add backbone attributes + if not hasattr(self.backbone.config, "hidden_size"): + raise ValueError("The backbone should have a hidden_size attribute") + if not hasattr(self.backbone.config, "image_size"): + raise ValueError("The backbone should have an image_size attribute") + if not hasattr(self.backbone.config, "patch_size"): + raise ValueError("The backbone should have a patch_size attribute") + + self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VITPOSE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=VitPoseEstimatorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, + flip_pairs: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, VitPoseEstimatorOutput]: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> processor = 
AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple") + >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]] + >>> inputs = processor(image, boxes=boxes, return_tensors="pt") + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + >>> heatmaps = outputs.heatmaps + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + loss = None + if labels is not None: + raise NotImplementedError("Training is not yet supported") + + outputs = self.backbone.forward_with_filtered_kwargs( + pixel_values, + dataset_index=dataset_index, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + # Turn output hidden states in tensor of shape (batch_size, num_channels, height, width) + sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1] + batch_size = sequence_output.shape[0] + patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0] + patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1] + sequence_output = ( + sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous() + ) + + heatmaps = self.head(sequence_output, flip_pairs=flip_pairs) + + if not return_dict: + if output_hidden_states: + output = (heatmaps,) + outputs[1:] + else: + output = (heatmaps,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return VitPoseEstimatorOutput( + loss=loss, + heatmaps=heatmaps, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["VitPosePreTrainedModel", "VitPoseForPoseEstimation"] diff --git a/docs/transformers/build/lib/transformers/models/vitpose_backbone/__init__.py b/docs/transformers/build/lib/transformers/models/vitpose_backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..858e93797d26948970f209e6429e4a586a5044a6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitpose_backbone/__init__.py @@ -0,0 +1,17 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. 
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_vitpose_backbone import *
+    from .modeling_vitpose_backbone import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py b/docs/transformers/build/lib/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..439e596273dbc3739b0b8f1cf7621e427571bc4c
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/vitpose_backbone/configuration_vitpose_backbone.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VitPose backbone configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class VitPoseBackboneConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`VitPoseBackbone`]. It is used to instantiate a
+    VitPose model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the VitPose
+    [usyd-community/vitpose-base-simple](https://huggingface.co/usyd-community/vitpose-base-simple) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`List[int]`, *optional*, defaults to `[256, 192]`):
+            The size (resolution) of each image.
+        patch_size (`List[int]`, *optional*, defaults to `[16, 16]`):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            The ratio of the hidden size in the feedforward network to the hidden size in the attention layers.
+        num_experts (`int`, *optional*, defaults to 1):
+            The number of experts in the MoE layer.
+        part_features (`int`, *optional*):
+            The number of part features to output. Only used in case `num_experts` is greater than 1.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        out_features (`List[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+    Example:
+
+    ```python
+    >>> from transformers import VitPoseBackboneConfig, VitPoseBackbone
+
+    >>> # Initializing a VitPose configuration
+    >>> configuration = VitPoseBackboneConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = VitPoseBackbone(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "vitpose_backbone"
+
+    def __init__(
+        self,
+        image_size=[256, 192],
+        patch_size=[16, 16],
+        num_channels=3,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        mlp_ratio=4,
+        num_experts=1,
+        part_features=256,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        qkv_bias=True,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.num_experts = num_experts
+        self.part_features = part_features
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["VitPoseBackboneConfig"]
diff --git
a/docs/transformers/build/lib/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py b/docs/transformers/build/lib/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..8abce8dfdc3d84bc35ea3ea0a9b4575aa6e57312 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vitpose_backbone/modeling_vitpose_backbone.py @@ -0,0 +1,579 @@ +# coding=utf-8 +# Copyright 2024 University of Sydney and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch VitPose backbone model. + +This code is the same as the original Vision Transformer (ViT) with 2 modifications: +- use of padding=2 in the patch embedding layer +- addition of a mixture-of-experts MLP layer +""" + +import collections.abc +from typing import Callable, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BackboneOutput, BaseModelOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_vitpose_backbone import VitPoseBackboneConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "VitPoseBackboneConfig" + + +class VitPoseBackbonePatchEmbeddings(nn.Module): + """Image to Patch Embedding.""" + + def __init__(self, config): + super().__init__() + + image_size = config.image_size + patch_size = config.patch_size + num_channels = config.num_channels + embed_dim = config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=2) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + height, width = pixel_values.shape[-2:] + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values) + + embeddings = embeddings.flatten(2).transpose(1, 2) + return embeddings + + +class VitPoseBackboneEmbeddings(nn.Module): + """ + Construct the position and patch embeddings. 
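+
+    Note (added for clarity; the numbers are the worked-out defaults, not new behavior): with the default 256x192
+    input and 16x16 patches, the convolutional projection in `VitPoseBackbonePatchEmbeddings` uses `padding=2` with
+    `stride=16`, so the output grid is `floor((256 + 2 * 2 - 16) / 16) + 1 = 16` by
+    `floor((192 + 2 * 2 - 16) / 16) + 1 = 12`, i.e. `num_patches = 16 * 12 = 192`, which matches
+    `(image_size[0] // patch_size[0]) * (image_size[1] // patch_size[1])`.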
+ """ + + def __init__(self, config: VitPoseBackboneConfig) -> None: + super().__init__() + + self.patch_embeddings = VitPoseBackbonePatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + embeddings = self.patch_embeddings(pixel_values) + + # add positional encoding to each token + embeddings = embeddings + self.position_embeddings[:, 1:] + self.position_embeddings[:, :1] + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->VitPoseBackbone +class VitPoseBackboneSelfAttention(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." 
+ ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->VitPoseBackbone +class VitPoseBackboneSelfOutput(nn.Module): + """ + The residual connection is defined in VitPoseBackboneLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
+ """ + + def __init__(self, config: VitPoseBackboneConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->VitPoseBackbone +class VitPoseBackboneAttention(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: + super().__init__() + self.attention = VitPoseBackboneSelfAttention(config) + self.output = VitPoseBackboneSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class VitPoseBackboneMoeMLP(nn.Module): + def __init__(self, config: VitPoseBackboneConfig): + super().__init__() + + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + + num_experts = config.num_experts + part_features = config.part_features + + self.part_features = part_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = ACT2FN[config.hidden_act] + self.fc2 = nn.Linear(hidden_features, out_features - part_features) + self.drop = nn.Dropout(config.hidden_dropout_prob) + + self.num_experts = num_experts + experts = [nn.Linear(hidden_features, part_features) for _ in range(num_experts)] + self.experts = nn.ModuleList(experts) + + def forward(self, hidden_state: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: + expert_hidden_state = torch.zeros_like(hidden_state[:, :, -self.part_features :]) + + hidden_state = self.fc1(hidden_state) + hidden_state = self.act(hidden_state) + shared_hidden_state = self.fc2(hidden_state) + indices = indices.view(-1, 1, 1) + + # to support ddp training + for i in range(self.num_experts): + selected_index = indices == i + current_hidden_state = self.experts[i](hidden_state) * selected_index + expert_hidden_state = expert_hidden_state + current_hidden_state + + hidden_state = torch.cat([shared_hidden_state, expert_hidden_state], dim=-1) + + return hidden_state + + +class VitPoseBackboneMLP(nn.Module): + def __init__(self, config: 
VitPoseBackboneConfig) -> None: + super().__init__() + in_features = out_features = config.hidden_size + hidden_features = int(config.hidden_size * config.mlp_ratio) + self.fc1 = nn.Linear(in_features, hidden_features, bias=True) + self.activation = ACT2FN[config.hidden_act] + self.fc2 = nn.Linear(hidden_features, out_features, bias=True) + + def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: + hidden_state = self.fc1(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.fc2(hidden_state) + return hidden_state + + +class VitPoseBackboneLayer(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: + super().__init__() + self.num_experts = config.num_experts + self.attention = VitPoseBackboneAttention(config) + self.mlp = VitPoseBackboneMLP(config) if self.num_experts == 1 else VitPoseBackboneMoeMLP(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + # Validate dataset_index when using multiple experts + if self.num_experts > 1 and dataset_index is None: + raise ValueError( + "dataset_index must be provided when using multiple experts " + f"(num_experts={self.num_experts}). Please provide dataset_index " + "to the forward pass." + ) + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in VitPoseBackbone, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + layer_output = self.layernorm_after(hidden_states) + if self.num_experts == 1: + layer_output = self.mlp(layer_output) + else: + layer_output = self.mlp(layer_output, indices=dataset_index) + + # second residual connection + layer_output = layer_output + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->VitPoseBackbone +class VitPoseBackboneEncoder(nn.Module): + def __init__(self, config: VitPoseBackboneConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([VitPoseBackboneLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + def forward( + self, + hidden_states: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + dataset_index, + layer_head_mask, + 
output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, dataset_index, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class VitPoseBackbonePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = VitPoseBackboneConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["VitPoseBackboneEmbeddings", "VitPoseBackboneLayer"] + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm, VitPoseBackboneEmbeddings]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, VitPoseBackboneEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + +VITPOSE_BACKBONE_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`VitPoseBackboneConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VITPOSE_BACKBONE_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. + + dataset_index (`torch.Tensor` of shape `(batch_size,)`): + Index to use in the Mixture-of-Experts (MoE) blocks of the backbone. + + This corresponds to the dataset index used during training, e.g. index 0 refers to COCO. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The VitPose backbone useful for downstream tasks.", + VITPOSE_BACKBONE_START_DOCSTRING, +) +class VitPoseBackbone(VitPoseBackbonePreTrainedModel, BackboneMixin): + def __init__(self, config: VitPoseBackboneConfig): + super().__init__(config) + super()._init_backbone(config) + + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = VitPoseBackboneEmbeddings(config) + self.encoder = VitPoseBackboneEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VITPOSE_BACKBONE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: torch.Tensor, + dataset_index: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + """ + Returns: + + Examples: + + ```python + >>> from transformers import VitPoseBackboneConfig, VitPoseBackbone + >>> import torch + + >>> config = VitPoseBackboneConfig(out_indices=[-1]) + >>> model = VitPoseBackbone(config) + + >>> pixel_values = torch.randn(1, 3, 256, 192) + >>> dataset_index = torch.tensor([1]) + >>> outputs = model(pixel_values, dataset_index) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values) + + outputs = self.encoder( + embedding_output, + dataset_index=dataset_index, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=return_dict, + ) + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + hidden_state = self.layernorm(hidden_state) + feature_maps += (hidden_state,) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = ["VitPoseBackbonePreTrainedModel", "VitPoseBackbone"] diff --git a/docs/transformers/build/lib/transformers/models/vits/__init__.py b/docs/transformers/build/lib/transformers/models/vits/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0cf54b7884848784ac7e94062be8da91f4fedcf7 --- 
/dev/null +++ b/docs/transformers/build/lib/transformers/models/vits/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vits import * + from .modeling_vits import * + from .tokenization_vits import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/vits/configuration_vits.py b/docs/transformers/build/lib/transformers/models/vits/configuration_vits.py new file mode 100644 index 0000000000000000000000000000000000000000..6de2591b0f3addd9536e0ed81023251666764163 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vits/configuration_vits.py @@ -0,0 +1,253 @@ +# coding=utf-8 +# Copyright 2023 The Kakao Enterprise Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""VITS model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class VitsConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VitsModel`]. It is used to instantiate a VITS + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the VITS + [facebook/mms-tts-eng](https://huggingface.co/facebook/mms-tts-eng) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 38): + Vocabulary size of the VITS model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed to the forward method of [`VitsModel`]. + hidden_size (`int`, *optional*, defaults to 192): + Dimensionality of the text encoder layers. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer encoder. 
+ window_size (`int`, *optional*, defaults to 4): + Window size for the relative positional embeddings in the attention layers of the Transformer encoder. + use_bias (`bool`, *optional*, defaults to `True`): + Whether to use bias in the key, query, value projection layers in the Transformer encoder. + ffn_dim (`int`, *optional*, defaults to 768): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) + for more details. + ffn_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the 1D convolution layers used by the feed-forward network in the Transformer encoder. + flow_size (`int`, *optional*, defaults to 192): + Dimensionality of the flow layers. + spectrogram_bins (`int`, *optional*, defaults to 513): + Number of frequency bins in the target spectrogram. + hidden_act (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for activations inside the fully connected layer. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + use_stochastic_duration_prediction (`bool`, *optional*, defaults to `True`): + Whether to use the stochastic duration prediction module or the regular duration predictor. + num_speakers (`int`, *optional*, defaults to 1): + Number of speakers if this is a multi-speaker model. + speaker_embedding_size (`int`, *optional*, defaults to 0): + Number of channels used by the speaker embeddings. Is zero for single-speaker models. + upsample_initial_channel (`int`, *optional*, defaults to 512): + The number of input channels into the HiFi-GAN upsampling network. + upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`): + A tuple of integers defining the stride of each 1D convolutional layer in the HiFi-GAN upsampling network. + The length of `upsample_rates` defines the number of convolutional layers and has to match the length of + `upsample_kernel_sizes`. + upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the HiFi-GAN upsampling + network. The length of `upsample_kernel_sizes` defines the number of convolutional layers and has to match + the length of `upsample_rates`. + resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`): + A tuple of integers defining the kernel sizes of the 1D convolutional layers in the HiFi-GAN + multi-receptive field fusion (MRF) module.
+ resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`): + A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the + HiFi-GAN multi-receptive field fusion (MRF) module. + leaky_relu_slope (`float`, *optional*, defaults to 0.1): + The angle of the negative slope used by the leaky ReLU activation. + depth_separable_channels (`int`, *optional*, defaults to 2): + Number of channels to use in each depth-separable block. + depth_separable_num_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers to use in each depth-separable block. + duration_predictor_flow_bins (`int`, *optional*, defaults to 10): + Number of channels to map using the unconstrained rational spline in the duration predictor model. + duration_predictor_tail_bound (`float`, *optional*, defaults to 5.0): + Value of the tail bin boundary when computing the unconstrained rational spline in the duration predictor + model. + duration_predictor_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the 1D convolution layers used in the duration predictor model. + duration_predictor_dropout (`float`, *optional*, defaults to 0.5): + The dropout ratio for the duration predictor model. + duration_predictor_num_flows (`int`, *optional*, defaults to 4): + Number of flow stages used by the duration predictor model. + duration_predictor_filter_channels (`int`, *optional*, defaults to 256): + Number of channels for the convolution layers used in the duration predictor model. + prior_encoder_num_flows (`int`, *optional*, defaults to 4): + Number of flow stages used by the prior encoder flow model. + prior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 4): + Number of WaveNet layers used by the prior encoder flow model. + posterior_encoder_num_wavenet_layers (`int`, *optional*, defaults to 16): + Number of WaveNet layers used by the posterior encoder model. + wavenet_kernel_size (`int`, *optional*, defaults to 5): + Kernel size of the 1D convolution layers used in the WaveNet model. + wavenet_dilation_rate (`int`, *optional*, defaults to 1): + Dilation rates of the dilated 1D convolutional layers used in the WaveNet model. + wavenet_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the WaveNet layers. + speaking_rate (`float`, *optional*, defaults to 1.0): + Speaking rate. Larger values give faster synthesised speech. + noise_scale (`float`, *optional*, defaults to 0.667): + How random the speech prediction is. Larger values create more variation in the predicted speech. + noise_scale_duration (`float`, *optional*, defaults to 0.8): + How random the duration prediction is. Larger values create more variation in the predicted durations. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the output audio waveform is digitalized, expressed in hertz (Hz).
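+
+ With the default `upsample_rates`, the HiFi-GAN decoder multiplies the spectrogram length by 8 * 8 * 2 * 2 = 256,
+ i.e. it generates 256 waveform samples per spectrogram frame. A minimal sketch of this relation (for intuition
+ only, not part of the configuration API):
+
+ ```python
+ >>> import math
+
+ >>> math.prod([8, 8, 2, 2])  # total upsampling factor with the default `upsample_rates`
+ 256
+ >>> # e.g. 100 spectrogram frames -> 100 * 256 = 25600 samples, or 1.6 seconds at `sampling_rate=16000`
+ ```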
+ + Example: + + ```python + >>> from transformers import VitsModel, VitsConfig + + >>> # Initializing a "facebook/mms-tts-eng" style configuration + >>> configuration = VitsConfig() + + >>> # Initializing a model (with random weights) from the "facebook/mms-tts-eng" style configuration + >>> model = VitsModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vits" + + def __init__( + self, + vocab_size=38, + hidden_size=192, + num_hidden_layers=6, + num_attention_heads=2, + window_size=4, + use_bias=True, + ffn_dim=768, + layerdrop=0.1, + ffn_kernel_size=3, + flow_size=192, + spectrogram_bins=513, + hidden_act="relu", + hidden_dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_stochastic_duration_prediction=True, + num_speakers=1, + speaker_embedding_size=0, + upsample_initial_channel=512, + upsample_rates=[8, 8, 2, 2], + upsample_kernel_sizes=[16, 16, 4, 4], + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + leaky_relu_slope=0.1, + depth_separable_channels=2, + depth_separable_num_layers=3, + duration_predictor_flow_bins=10, + duration_predictor_tail_bound=5.0, + duration_predictor_kernel_size=3, + duration_predictor_dropout=0.5, + duration_predictor_num_flows=4, + duration_predictor_filter_channels=256, + prior_encoder_num_flows=4, + prior_encoder_num_wavenet_layers=4, + posterior_encoder_num_wavenet_layers=16, + wavenet_kernel_size=5, + wavenet_dilation_rate=1, + wavenet_dropout=0.0, + speaking_rate=1.0, + noise_scale=0.667, + noise_scale_duration=0.8, + sampling_rate=16_000, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.window_size = window_size + self.use_bias = use_bias + self.ffn_dim = ffn_dim + self.layerdrop = layerdrop + self.ffn_kernel_size = ffn_kernel_size + self.flow_size = flow_size + self.spectrogram_bins = spectrogram_bins + self.hidden_act = hidden_act + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_stochastic_duration_prediction = use_stochastic_duration_prediction + self.num_speakers = num_speakers + self.speaker_embedding_size = speaker_embedding_size + self.upsample_initial_channel = upsample_initial_channel + self.upsample_rates = upsample_rates + self.upsample_kernel_sizes = upsample_kernel_sizes + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.leaky_relu_slope = leaky_relu_slope + self.depth_separable_channels = depth_separable_channels + self.depth_separable_num_layers = depth_separable_num_layers + self.duration_predictor_flow_bins = duration_predictor_flow_bins + self.duration_predictor_tail_bound = duration_predictor_tail_bound + self.duration_predictor_kernel_size = duration_predictor_kernel_size + self.duration_predictor_dropout = duration_predictor_dropout + self.duration_predictor_num_flows = duration_predictor_num_flows + self.duration_predictor_filter_channels = duration_predictor_filter_channels + self.prior_encoder_num_flows = prior_encoder_num_flows + self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers + self.posterior_encoder_num_wavenet_layers = 
posterior_encoder_num_wavenet_layers + self.wavenet_kernel_size = wavenet_kernel_size + self.wavenet_dilation_rate = wavenet_dilation_rate + self.wavenet_dropout = wavenet_dropout + self.speaking_rate = speaking_rate + self.noise_scale = noise_scale + self.noise_scale_duration = noise_scale_duration + self.sampling_rate = sampling_rate + + if len(upsample_kernel_sizes) != len(upsample_rates): + raise ValueError( + f"The length of `upsample_kernel_sizes` ({len(upsample_kernel_sizes)}) must match the length of " + f"`upsample_rates` ({len(upsample_rates)})" + ) + + super().__init__(**kwargs) + + +__all__ = ["VitsConfig"] diff --git a/docs/transformers/build/lib/transformers/models/vits/convert_original_checkpoint.py b/docs/transformers/build/lib/transformers/models/vits/convert_original_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..7f122e86fa54ac75b8051ef13836e38924f57d1a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vits/convert_original_checkpoint.py @@ -0,0 +1,390 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert VITS checkpoint.""" + +import argparse +import json +import tempfile + +import torch +from huggingface_hub import hf_hub_download + +from transformers import VitsConfig, VitsModel, VitsTokenizer, logging + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.vits") + +MAPPING_TEXT_ENCODER = { + "enc_p.emb": "text_encoder.embed_tokens", + "enc_p.encoder.attn_layers.*.conv_k": "text_encoder.encoder.layers.*.attention.k_proj", + "enc_p.encoder.attn_layers.*.conv_v": "text_encoder.encoder.layers.*.attention.v_proj", + "enc_p.encoder.attn_layers.*.conv_q": "text_encoder.encoder.layers.*.attention.q_proj", + "enc_p.encoder.attn_layers.*.conv_o": "text_encoder.encoder.layers.*.attention.out_proj", + "enc_p.encoder.attn_layers.*.emb_rel_k": "text_encoder.encoder.layers.*.attention.emb_rel_k", + "enc_p.encoder.attn_layers.*.emb_rel_v": "text_encoder.encoder.layers.*.attention.emb_rel_v", + "enc_p.encoder.norm_layers_1.*.gamma": "text_encoder.encoder.layers.*.layer_norm.weight", + "enc_p.encoder.norm_layers_1.*.beta": "text_encoder.encoder.layers.*.layer_norm.bias", + "enc_p.encoder.ffn_layers.*.conv_1": "text_encoder.encoder.layers.*.feed_forward.conv_1", + "enc_p.encoder.ffn_layers.*.conv_2": "text_encoder.encoder.layers.*.feed_forward.conv_2", + "enc_p.encoder.norm_layers_2.*.gamma": "text_encoder.encoder.layers.*.final_layer_norm.weight", + "enc_p.encoder.norm_layers_2.*.beta": "text_encoder.encoder.layers.*.final_layer_norm.bias", + "enc_p.proj": "text_encoder.project", +} +MAPPING_STOCHASTIC_DURATION_PREDICTOR = { + "dp.pre": "duration_predictor.conv_pre", + "dp.proj": "duration_predictor.conv_proj", + "dp.convs.convs_sep.*": "duration_predictor.conv_dds.convs_dilated.*", + "dp.convs.convs_1x1.*": "duration_predictor.conv_dds.convs_pointwise.*", + "dp.convs.norms_1.*.gamma": 
"duration_predictor.conv_dds.norms_1.*.weight", + "dp.convs.norms_1.*.beta": "duration_predictor.conv_dds.norms_1.*.bias", + "dp.convs.norms_2.*.gamma": "duration_predictor.conv_dds.norms_2.*.weight", + "dp.convs.norms_2.*.beta": "duration_predictor.conv_dds.norms_2.*.bias", + "dp.flows.0.logs": "duration_predictor.flows.0.log_scale", + "dp.flows.0.m": "duration_predictor.flows.0.translate", + "dp.flows.*.pre": "duration_predictor.flows.*.conv_pre", + "dp.flows.*.proj": "duration_predictor.flows.*.conv_proj", + "dp.flows.*.convs.convs_1x1.0": "duration_predictor.flows.*.conv_dds.convs_pointwise.0", + "dp.flows.*.convs.convs_1x1.1": "duration_predictor.flows.*.conv_dds.convs_pointwise.1", + "dp.flows.*.convs.convs_1x1.2": "duration_predictor.flows.*.conv_dds.convs_pointwise.2", + "dp.flows.*.convs.convs_sep.0": "duration_predictor.flows.*.conv_dds.convs_dilated.0", + "dp.flows.*.convs.convs_sep.1": "duration_predictor.flows.*.conv_dds.convs_dilated.1", + "dp.flows.*.convs.convs_sep.2": "duration_predictor.flows.*.conv_dds.convs_dilated.2", + "dp.flows.*.convs.norms_1.0.gamma": "duration_predictor.flows.*.conv_dds.norms_1.0.weight", + "dp.flows.*.convs.norms_1.0.beta": "duration_predictor.flows.*.conv_dds.norms_1.0.bias", + "dp.flows.*.convs.norms_1.1.gamma": "duration_predictor.flows.*.conv_dds.norms_1.1.weight", + "dp.flows.*.convs.norms_1.1.beta": "duration_predictor.flows.*.conv_dds.norms_1.1.bias", + "dp.flows.*.convs.norms_1.2.gamma": "duration_predictor.flows.*.conv_dds.norms_1.2.weight", + "dp.flows.*.convs.norms_1.2.beta": "duration_predictor.flows.*.conv_dds.norms_1.2.bias", + "dp.flows.*.convs.norms_2.0.gamma": "duration_predictor.flows.*.conv_dds.norms_2.0.weight", + "dp.flows.*.convs.norms_2.0.beta": "duration_predictor.flows.*.conv_dds.norms_2.0.bias", + "dp.flows.*.convs.norms_2.1.gamma": "duration_predictor.flows.*.conv_dds.norms_2.1.weight", + "dp.flows.*.convs.norms_2.1.beta": "duration_predictor.flows.*.conv_dds.norms_2.1.bias", + "dp.flows.*.convs.norms_2.2.gamma": "duration_predictor.flows.*.conv_dds.norms_2.2.weight", + "dp.flows.*.convs.norms_2.2.beta": "duration_predictor.flows.*.conv_dds.norms_2.2.bias", + "dp.post_pre": "duration_predictor.post_conv_pre", + "dp.post_proj": "duration_predictor.post_conv_proj", + "dp.post_convs.convs_sep.*": "duration_predictor.post_conv_dds.convs_dilated.*", + "dp.post_convs.convs_1x1.*": "duration_predictor.post_conv_dds.convs_pointwise.*", + "dp.post_convs.norms_1.*.gamma": "duration_predictor.post_conv_dds.norms_1.*.weight", + "dp.post_convs.norms_1.*.beta": "duration_predictor.post_conv_dds.norms_1.*.bias", + "dp.post_convs.norms_2.*.gamma": "duration_predictor.post_conv_dds.norms_2.*.weight", + "dp.post_convs.norms_2.*.beta": "duration_predictor.post_conv_dds.norms_2.*.bias", + "dp.post_flows.0.logs": "duration_predictor.post_flows.0.log_scale", + "dp.post_flows.0.m": "duration_predictor.post_flows.0.translate", + "dp.post_flows.*.pre": "duration_predictor.post_flows.*.conv_pre", + "dp.post_flows.*.proj": "duration_predictor.post_flows.*.conv_proj", + "dp.post_flows.*.convs.convs_1x1.0": "duration_predictor.post_flows.*.conv_dds.convs_pointwise.0", + "dp.post_flows.*.convs.convs_1x1.1": "duration_predictor.post_flows.*.conv_dds.convs_pointwise.1", + "dp.post_flows.*.convs.convs_1x1.2": "duration_predictor.post_flows.*.conv_dds.convs_pointwise.2", + "dp.post_flows.*.convs.convs_sep.0": "duration_predictor.post_flows.*.conv_dds.convs_dilated.0", + "dp.post_flows.*.convs.convs_sep.1": 
"duration_predictor.post_flows.*.conv_dds.convs_dilated.1", + "dp.post_flows.*.convs.convs_sep.2": "duration_predictor.post_flows.*.conv_dds.convs_dilated.2", + "dp.post_flows.*.convs.norms_1.0.gamma": "duration_predictor.post_flows.*.conv_dds.norms_1.0.weight", + "dp.post_flows.*.convs.norms_1.0.beta": "duration_predictor.post_flows.*.conv_dds.norms_1.0.bias", + "dp.post_flows.*.convs.norms_1.1.gamma": "duration_predictor.post_flows.*.conv_dds.norms_1.1.weight", + "dp.post_flows.*.convs.norms_1.1.beta": "duration_predictor.post_flows.*.conv_dds.norms_1.1.bias", + "dp.post_flows.*.convs.norms_1.2.gamma": "duration_predictor.post_flows.*.conv_dds.norms_1.2.weight", + "dp.post_flows.*.convs.norms_1.2.beta": "duration_predictor.post_flows.*.conv_dds.norms_1.2.bias", + "dp.post_flows.*.convs.norms_2.0.gamma": "duration_predictor.post_flows.*.conv_dds.norms_2.0.weight", + "dp.post_flows.*.convs.norms_2.0.beta": "duration_predictor.post_flows.*.conv_dds.norms_2.0.bias", + "dp.post_flows.*.convs.norms_2.1.gamma": "duration_predictor.post_flows.*.conv_dds.norms_2.1.weight", + "dp.post_flows.*.convs.norms_2.1.beta": "duration_predictor.post_flows.*.conv_dds.norms_2.1.bias", + "dp.post_flows.*.convs.norms_2.2.gamma": "duration_predictor.post_flows.*.conv_dds.norms_2.2.weight", + "dp.post_flows.*.convs.norms_2.2.beta": "duration_predictor.post_flows.*.conv_dds.norms_2.2.bias", + "dp.cond": "duration_predictor.cond", # num_speakers > 1 +} +MAPPING_FLOW = { + "flow.flows.*.pre": "flow.flows.*.conv_pre", + "flow.flows.*.enc.in_layers.0": "flow.flows.*.wavenet.in_layers.0", + "flow.flows.*.enc.in_layers.1": "flow.flows.*.wavenet.in_layers.1", + "flow.flows.*.enc.in_layers.2": "flow.flows.*.wavenet.in_layers.2", + "flow.flows.*.enc.in_layers.3": "flow.flows.*.wavenet.in_layers.3", + "flow.flows.*.enc.res_skip_layers.0": "flow.flows.*.wavenet.res_skip_layers.0", + "flow.flows.*.enc.res_skip_layers.1": "flow.flows.*.wavenet.res_skip_layers.1", + "flow.flows.*.enc.res_skip_layers.2": "flow.flows.*.wavenet.res_skip_layers.2", + "flow.flows.*.enc.res_skip_layers.3": "flow.flows.*.wavenet.res_skip_layers.3", + "flow.flows.*.enc.cond_layer": "flow.flows.*.wavenet.cond_layer", # num_speakers > 1 + "flow.flows.*.post": "flow.flows.*.conv_post", +} +MAPPING_GENERATOR = { + "dec.conv_pre": "decoder.conv_pre", + "dec.ups.0": "decoder.upsampler.0", + "dec.ups.1": "decoder.upsampler.1", + "dec.ups.2": "decoder.upsampler.2", + "dec.ups.3": "decoder.upsampler.3", + "dec.resblocks.*.convs1.0": "decoder.resblocks.*.convs1.0", + "dec.resblocks.*.convs1.1": "decoder.resblocks.*.convs1.1", + "dec.resblocks.*.convs1.2": "decoder.resblocks.*.convs1.2", + "dec.resblocks.*.convs2.0": "decoder.resblocks.*.convs2.0", + "dec.resblocks.*.convs2.1": "decoder.resblocks.*.convs2.1", + "dec.resblocks.*.convs2.2": "decoder.resblocks.*.convs2.2", + "dec.conv_post": "decoder.conv_post", + "dec.cond": "decoder.cond", # num_speakers > 1 +} +MAPPING_POSTERIOR_ENCODER = { + "enc_q.pre": "posterior_encoder.conv_pre", + "enc_q.enc.in_layers.*": "posterior_encoder.wavenet.in_layers.*", + "enc_q.enc.res_skip_layers.*": "posterior_encoder.wavenet.res_skip_layers.*", + "enc_q.enc.cond_layer": "posterior_encoder.wavenet.cond_layer", # num_speakers > 1 + "enc_q.proj": "posterior_encoder.conv_proj", +} +MAPPING = { + **MAPPING_TEXT_ENCODER, + **MAPPING_STOCHASTIC_DURATION_PREDICTOR, + **MAPPING_FLOW, + **MAPPING_GENERATOR, + **MAPPING_POSTERIOR_ENCODER, + "emb_g": "embed_speaker", # num_speakers > 1 +} +TOP_LEVEL_KEYS = [] +IGNORE_KEYS = [] + + +def 
set_recursively(hf_pointer, key, value, full_name, weight_type): + for attribute in key.split("."): + hf_pointer = getattr(hf_pointer, attribute) + + if weight_type is not None: + hf_shape = getattr(hf_pointer, weight_type).shape + else: + hf_shape = hf_pointer.shape + + # strip off the kernel dimension at the end (original weights are Conv1d) + if key.endswith(".k_proj") or key.endswith(".v_proj") or key.endswith(".q_proj") or key.endswith(".out_proj"): + value = value.squeeze(-1) + + if hf_shape != value.shape: + raise ValueError( + f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be" + f" {value.shape} for {full_name}" + ) + + if weight_type == "weight": + hf_pointer.weight.data = value + elif weight_type == "weight_g": + hf_pointer.weight_g.data = value + elif weight_type == "weight_v": + hf_pointer.weight_v.data = value + elif weight_type == "bias": + hf_pointer.bias.data = value + elif weight_type == "running_mean": + hf_pointer.running_mean.data = value + elif weight_type == "running_var": + hf_pointer.running_var.data = value + elif weight_type == "num_batches_tracked": + hf_pointer.num_batches_tracked.data = value + else: + hf_pointer.data = value + + logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.") + + +def should_ignore(name, ignore_keys): + for key in ignore_keys: + if key.endswith(".*"): + if name.startswith(key[:-1]): + return True + elif ".*." in key: + prefix, suffix = key.split(".*.") + if prefix in name and suffix in name: + return True + elif key in name: + return True + return False + + +def recursively_load_weights(fairseq_dict, hf_model): + unused_weights = [] + + for name, value in fairseq_dict.items(): + if should_ignore(name, IGNORE_KEYS): + logger.info(f"{name} was ignored") + continue + + is_used = False + for key, mapped_key in MAPPING.items(): + if key.endswith(".*"): + key = key[:-1] + elif "*" in key: + prefix, suffix = key.split(".*.") + if prefix in name and suffix in name: + key = suffix + + if key in name: + is_used = True + if mapped_key.endswith(".*"): + layer_index = name.split(key)[-1].split(".")[0] + mapped_key = mapped_key.replace("*", layer_index) + elif "*" in mapped_key: + layer_index = name.split(key)[0].split(".")[-2] + + # remap the layer index since we removed the Flip layers + if "flow.flows" in mapped_key: + layer_index = str(int(layer_index) // 2) + if "duration_predictor.flows" in mapped_key or "duration_predictor.post_flows" in mapped_key: + layer_index = str(int(layer_index) // 2 + 1) + + mapped_key = mapped_key.replace("*", layer_index) + if "weight_g" in name: + weight_type = "weight_g" + elif "weight_v" in name: + weight_type = "weight_v" + elif "bias" in name: + weight_type = "bias" + elif "weight" in name: + weight_type = "weight" + elif "running_mean" in name: + weight_type = "running_mean" + elif "running_var" in name: + weight_type = "running_var" + elif "num_batches_tracked" in name: + weight_type = "num_batches_tracked" + else: + weight_type = None + set_recursively(hf_model, mapped_key, value, name, weight_type) + continue + if not is_used: + unused_weights.append(name) + + logger.warning(f"Unused weights: {unused_weights}") + + +@torch.no_grad() +def convert_checkpoint( + pytorch_dump_folder_path, + checkpoint_path=None, + config_path=None, + vocab_path=None, + language=None, + num_speakers=None, + sampling_rate=None, + repo_id=None, +): + """ + Copy/paste/tweak model's weights to transformers design. 
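+
+ Example (a hypothetical invocation; with no `checkpoint_path`, the script downloads the MMS-TTS checkpoint for
+ the given language from the Hub and converts it):
+
+ ```python
+ >>> convert_checkpoint("./vits-mms-eng", language="eng")
+ ```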
+ """ + if config_path is not None: + config = VitsConfig.from_pretrained(config_path) + else: + config = VitsConfig() + + if num_speakers: + config.num_speakers = num_speakers + config.speaker_embedding_size = 256 + + if sampling_rate: + config.sampling_rate = sampling_rate + + if checkpoint_path is None: + logger.info(f"***Converting model: facebook/mms-tts {language}***") + + vocab_path = hf_hub_download( + repo_id="facebook/mms-tts", + filename="vocab.txt", + subfolder=f"models/{language}", + ) + config_file = hf_hub_download( + repo_id="facebook/mms-tts", + filename="config.json", + subfolder=f"models/{language}", + ) + checkpoint_path = hf_hub_download( + repo_id="facebook/mms-tts", + filename="G_100000.pth", + subfolder=f"models/{language}", + ) + + with open(config_file, "r") as f: + data = f.read() + hps = json.loads(data) + + is_uroman = hps["data"]["training_files"].split(".")[-1] == "uroman" + if is_uroman: + logger.warning("For this checkpoint, you should use `uroman` to convert input text before tokenizing it!") + else: + logger.info(f"***Converting model: {checkpoint_path}***") + is_uroman = False + + # original VITS checkpoint + if vocab_path is None: + _pad = "_" + _punctuation = ';:,.!?¡¿—…"«»“” ' + _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" + symbols = _pad + _punctuation + _letters + _letters_ipa + symbol_to_id = {s: i for i, s in enumerate(symbols)} + phonemize = True + else: + # Save vocab as temporary json file + symbols = [line.replace("\n", "") for line in open(vocab_path, encoding="utf-8").readlines()] + symbol_to_id = {s: i for i, s in enumerate(symbols)} + # MMS-TTS does not use a token, so we set to the token used to space characters + _pad = symbols[0] + phonemize = False + + with tempfile.NamedTemporaryFile() as tf: + with open(tf.name, "w", encoding="utf-8") as f: + f.write(json.dumps(symbol_to_id, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + tokenizer = VitsTokenizer(tf.name, language=language, phonemize=phonemize, is_uroman=is_uroman, pad_token=_pad) + + config.vocab_size = len(symbols) + model = VitsModel(config) + + model.decoder.apply_weight_norm() + + orig_checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=True) + recursively_load_weights(orig_checkpoint["model"], model) + + model.decoder.remove_weight_norm() + + model.save_pretrained(pytorch_dump_folder_path) + tokenizer.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + tokenizer.push_to_hub(repo_id) + model.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", default=None, type=str, help="Local path to original checkpoint") + parser.add_argument("--vocab_path", default=None, type=str, help="Path to vocab.txt") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + parser.add_argument("--language", default=None, type=str, help="Tokenizer language (three-letter code)") + parser.add_argument("--num_speakers", default=None, type=int, help="Number of speakers") + parser.add_argument( + "--sampling_rate", default=None, type=int, help="Sampling rate on which the model was trained." + ) + parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." 
+ ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + convert_checkpoint( + args.pytorch_dump_folder_path, + args.checkpoint_path, + args.config_path, + args.vocab_path, + args.language, + args.num_speakers, + args.sampling_rate, + args.push_to_hub, + ) diff --git a/docs/transformers/build/lib/transformers/models/vits/modeling_vits.py b/docs/transformers/build/lib/transformers/models/vits/modeling_vits.py new file mode 100644 index 0000000000000000000000000000000000000000..59483d3e61364e91b08521994ae1d9cb60745f47 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vits/modeling_vits.py @@ -0,0 +1,1493 @@ +# coding=utf-8 +# Copyright 2023 The Kakao Enterprise Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch VITS model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + ModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_vits import VitsConfig + + +logger = logging.get_logger(__name__) + + +# General docstring +_CONFIG_FOR_DOC = "VitsConfig" + + +@dataclass +class VitsModelOutput(ModelOutput): + """ + Describes the outputs for the VITS model, with potential hidden states and attentions. + + Args: + waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + The final audio waveform predicted by the model. + sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`): + The length in samples of each element in the `waveform` batch. + spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`): + The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi + GAN decoder model to obtain the final audio waveform. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + waveform: Optional[torch.FloatTensor] = None + sequence_lengths: Optional[torch.FloatTensor] = None + spectrogram: Optional[Tuple[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class VitsTextEncoderOutput(ModelOutput): + """ + Describes the outputs for the VITS text encoder model, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The predicted mean values of the prior distribution for the latent text variables. + prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The predicted log-variance values of the prior distribution for the latent text variables. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attention weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + prior_means: Optional[torch.FloatTensor] = None + prior_log_variances: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels): + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :num_channels, :]) + s_act = torch.sigmoid(in_act[:, num_channels:, :]) + acts = t_act * s_act + return acts + + +def _unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + reverse=False, + tail_bound=5.0, + min_bin_width=1e-3, + min_bin_height=1e-3, + min_derivative=1e-3, +): + """ + This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the + `tail_bound`, the transform behaves as an identity function. + + Args: + inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`: + Second half of the hidden-states input to the Vits convolutional flow module. 
+ unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module. + unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module. + unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module. + reverse (`bool`, *optional*, defaults to `False`): + Whether the model is being run in reverse mode. + tail_bound (`float`, *optional*, defaults to 5.0): + Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the + transform behaves as an identity function. + min_bin_width (`float`, *optional*, defaults to 1e-3): + Minimum bin value across the width dimension for the piecewise rational quadratic function. + min_bin_height (`float`, *optional*, defaults to 1e-3): + Minimum bin value across the height dimension for the piecewise rational quadratic function. + min_derivative (`float`, *optional*, defaults to 1e-3): + Minimum bin value across the derivatives for the piecewise rational quadratic function. + Returns: + outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`): + Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits + applied. + log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`): + Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound` + limits applied. + """ + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + log_abs_det = torch.zeros_like(inputs) + constant = np.log(np.exp(1 - min_derivative) - 1) + + unnormalized_derivatives = nn.functional.pad(unnormalized_derivatives, pad=(1, 1)) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + log_abs_det[outside_interval_mask] = 0.0 + + outputs[inside_interval_mask], log_abs_det[inside_interval_mask] = _rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + reverse=reverse, + tail_bound=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + return outputs, log_abs_det + + +def _rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + reverse, + tail_bound, + min_bin_width, + min_bin_height, + min_derivative, +): + """ + This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike + `_unconstrained_rational_quadratic_spline`, this function requires all inputs to lie inside the `tail_bound` + interval and raises an error otherwise.
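+
+ Concretely, following Durkan et al. (2019), "Neural Spline Flows" (https://arxiv.org/abs/1906.04032): for an
+ input falling into bin `k` with knots `(x_k, y_k)`, bin slope `s_k = (y_{k+1} - y_k) / (x_{k+1} - x_k)`, knot
+ derivatives `d_k`, and `theta = (x - x_k) / (x_{k+1} - x_k)`, the forward mode below evaluates
+
+ `g(x) = y_k + (y_{k+1} - y_k) * (s_k * theta^2 + d_k * theta * (1 - theta)) / (s_k + (d_{k+1} + d_k - 2 * s_k) * theta * (1 - theta))`
+
+ and the reverse mode solves the corresponding quadratic equation for `theta`.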
+ + Args: + inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`): + Second half of the hidden-states input to the Vits convolutional flow module. + unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module. + unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module. + unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`): + Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection + layer in the convolutional flow module. + reverse (`bool`): + Whether the model is being run in reverse mode. + tail_bound (`float`): + Upper and lower limit bound for the rational quadratic function; all `inputs` must lie inside + `[-tail_bound, tail_bound]`. + min_bin_width (`float`): + Minimum bin value across the width dimension for the piecewise rational quadratic function. + min_bin_height (`float`): + Minimum bin value across the height dimension for the piecewise rational quadratic function. + min_derivative (`float`): + Minimum bin value across the derivatives for the piecewise rational quadratic function. + Returns: + outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`): + Hidden-states as transformed by the piecewise rational quadratic function. + log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`): + Logarithm of the absolute value of the determinants corresponding to the `outputs`.
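+
+ Because the spline is analytically invertible, a quick sanity check is that running the unconstrained wrapper
+ defined above forward and then in reverse recovers its input up to numerical precision. A minimal sketch with
+ random spline parameters (shapes follow the defaults, e.g. `duration_predictor_flow_bins=10`):
+
+ ```python
+ >>> import torch
+
+ >>> torch.manual_seed(0)
+ >>> x = torch.rand(2, 1, 5) * 8 - 4  # values inside the default tail_bound of 5.0
+ >>> widths = torch.randn(2, 1, 5, 10)
+ >>> heights = torch.randn(2, 1, 5, 10)
+ >>> derivs = torch.randn(2, 1, 5, 9)  # num_bins - 1 values; the wrapper pads the two endpoints
+ >>> y, _ = _unconstrained_rational_quadratic_spline(x, widths, heights, derivs)
+ >>> x_rec, _ = _unconstrained_rational_quadratic_spline(y, widths, heights, derivs, reverse=True)
+ >>> torch.allclose(x, x_rec, atol=1e-4)
+ True
+ ```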
+ """ + upper_bound = tail_bound + lower_bound = -tail_bound + + if torch.min(inputs) < lower_bound or torch.max(inputs) > upper_bound: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError(f"Minimal bin width {min_bin_width} too large for the number of bins {num_bins}") + if min_bin_height * num_bins > 1.0: + raise ValueError(f"Minimal bin height {min_bin_height} too large for the number of bins {num_bins}") + + widths = nn.functional.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = nn.functional.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (upper_bound - lower_bound) * cumwidths + lower_bound + cumwidths[..., 0] = lower_bound + cumwidths[..., -1] = upper_bound + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + nn.functional.softplus(unnormalized_derivatives) + + heights = nn.functional.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = nn.functional.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (upper_bound - lower_bound) * cumheights + lower_bound + cumheights[..., 0] = lower_bound + cumheights[..., -1] = upper_bound + heights = cumheights[..., 1:] - cumheights[..., :-1] + + bin_locations = cumheights if reverse else cumwidths + bin_locations[..., -1] += 1e-6 + bin_idx = torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + bin_idx = bin_idx[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + intermediate1 = input_derivatives + input_derivatives_plus_one - 2 * input_delta + if not reverse: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta) + denominator = input_delta + intermediate1 * theta_one_minus_theta + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator) + return outputs, log_abs_det + else: + # find the roots of a quadratic equation + intermediate2 = inputs - input_cumheights + intermediate3 = intermediate2 * intermediate1 + a = input_heights * (input_delta - input_derivatives) + intermediate3 + b = input_heights * input_derivatives - intermediate3 + c = -input_delta * intermediate2 + + discriminant = b.pow(2) - 4 * a * c + if not (discriminant >= 0).all(): + raise RuntimeError(f"invalid discriminant {discriminant}") + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + intermediate1 * 
theta_one_minus_theta + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator) + return outputs, -log_abs_det + + +class VitsWaveNet(torch.nn.Module): + def __init__(self, config: VitsConfig, num_layers: int): + super().__init__() + self.hidden_size = config.hidden_size + self.num_layers = num_layers + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.dropout = nn.Dropout(config.wavenet_dropout) + + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + else: + weight_norm = nn.utils.weight_norm + + if config.speaker_embedding_size != 0: + cond_layer = torch.nn.Conv1d(config.speaker_embedding_size, 2 * config.hidden_size * num_layers, 1) + self.cond_layer = weight_norm(cond_layer, name="weight") + + for i in range(num_layers): + dilation = config.wavenet_dilation_rate**i + padding = (config.wavenet_kernel_size * dilation - dilation) // 2 + in_layer = torch.nn.Conv1d( + in_channels=config.hidden_size, + out_channels=2 * config.hidden_size, + kernel_size=config.wavenet_kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < num_layers - 1: + res_skip_channels = 2 * config.hidden_size + else: + res_skip_channels = config.hidden_size + + res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1) + res_skip_layer = weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, inputs, padding_mask, global_conditioning=None): + outputs = torch.zeros_like(inputs) + num_channels_tensor = torch.IntTensor([self.hidden_size]) + + if global_conditioning is not None: + global_conditioning = self.cond_layer(global_conditioning) + + for i in range(self.num_layers): + hidden_states = self.in_layers[i](inputs) + + if global_conditioning is not None: + cond_offset = i * 2 * self.hidden_size + global_states = global_conditioning[:, cond_offset : cond_offset + 2 * self.hidden_size, :] + else: + global_states = torch.zeros_like(hidden_states) + + acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0]) + acts = self.dropout(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.num_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_size, :] + inputs = (inputs + res_acts) * padding_mask + outputs = outputs + res_skip_acts[:, self.hidden_size :, :] + else: + outputs = outputs + res_skip_acts + + return outputs * padding_mask + + def remove_weight_norm(self): + if self.speaker_embedding_size != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for layer in self.in_layers: + torch.nn.utils.remove_weight_norm(layer) + for layer in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(layer) + + +class VitsPosteriorEncoder(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.out_channels = config.flow_size + + self.conv_pre = nn.Conv1d(config.spectrogram_bins, config.hidden_size, 1) + self.wavenet = VitsWaveNet(config, num_layers=config.posterior_encoder_num_wavenet_layers) + self.conv_proj = nn.Conv1d(config.hidden_size, self.out_channels * 2, 1) + + def forward(self, inputs, padding_mask, global_conditioning=None): + inputs = 
self.conv_pre(inputs) * padding_mask + inputs = self.wavenet(inputs, padding_mask, global_conditioning) + stats = self.conv_proj(inputs) * padding_mask + mean, log_stddev = torch.split(stats, self.out_channels, dim=1) + sampled = (mean + torch.randn_like(mean) * torch.exp(log_stddev)) * padding_mask + return sampled, mean, log_stddev + + +# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock +class HifiGanResidualBlock(nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1): + super().__init__() + self.leaky_relu_slope = leaky_relu_slope + + self.convs1 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=dilation[i], + padding=self.get_padding(kernel_size, dilation[i]), + ) + for i in range(len(dilation)) + ] + ) + self.convs2 = nn.ModuleList( + [ + nn.Conv1d( + channels, + channels, + kernel_size, + stride=1, + dilation=1, + padding=self.get_padding(kernel_size, 1), + ) + for _ in range(len(dilation)) + ] + ) + + def get_padding(self, kernel_size, dilation=1): + return (kernel_size * dilation - dilation) // 2 + + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + for layer in self.convs1: + weight_norm(layer) + for layer in self.convs2: + weight_norm(layer) + + def remove_weight_norm(self): + for layer in self.convs1: + nn.utils.remove_weight_norm(layer) + for layer in self.convs2: + nn.utils.remove_weight_norm(layer) + + def forward(self, hidden_states): + for conv1, conv2 in zip(self.convs1, self.convs2): + residual = hidden_states + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv1(hidden_states) + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv2(hidden_states) + hidden_states = hidden_states + residual + return hidden_states + + +class VitsHifiGan(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.config = config + self.num_kernels = len(config.resblock_kernel_sizes) + self.num_upsamples = len(config.upsample_rates) + self.conv_pre = nn.Conv1d( + config.flow_size, + config.upsample_initial_channel, + kernel_size=7, + stride=1, + padding=3, + ) + + self.upsampler = nn.ModuleList() + for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)): + self.upsampler.append( + nn.ConvTranspose1d( + config.upsample_initial_channel // (2**i), + config.upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel_size, + stride=upsample_rate, + padding=(kernel_size - upsample_rate) // 2, + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.upsampler)): + channels = config.upsample_initial_channel // (2 ** (i + 1)) + for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes): + self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope)) + + self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False) + + if config.speaker_embedding_size != 0: + self.cond = nn.Conv1d(config.speaker_embedding_size, config.upsample_initial_channel, 1) + + def apply_weight_norm(self): + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + for layer in self.upsampler: + 
weight_norm(layer)
+        for layer in self.resblocks:
+            layer.apply_weight_norm()
+
+    def remove_weight_norm(self):
+        for layer in self.upsampler:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+
+    def forward(
+        self, spectrogram: torch.FloatTensor, global_conditioning: Optional[torch.FloatTensor] = None
+    ) -> torch.FloatTensor:
+        r"""
+        Converts a spectrogram into a speech waveform.
+
+        Args:
+            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
+                Tensor containing the spectrograms.
+            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
+                Tensor containing speaker embeddings, for multispeaker models.
+
+        Returns:
+            `torch.FloatTensor`: Tensor of shape `(batch_size, 1, num_frames)` containing the speech waveform.
+        """
+        hidden_states = self.conv_pre(spectrogram)
+
+        if global_conditioning is not None:
+            hidden_states = hidden_states + self.cond(global_conditioning)
+
+        for i in range(self.num_upsamples):
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
+            hidden_states = self.upsampler[i](hidden_states)
+
+            res_state = self.resblocks[i * self.num_kernels](hidden_states)
+            for j in range(1, self.num_kernels):
+                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+            hidden_states = res_state / self.num_kernels
+
+        hidden_states = nn.functional.leaky_relu(hidden_states)
+        hidden_states = self.conv_post(hidden_states)
+        waveform = torch.tanh(hidden_states)
+        return waveform
+
+
+class VitsResidualCouplingLayer(nn.Module):
+    def __init__(self, config: VitsConfig):
+        super().__init__()
+        self.half_channels = config.flow_size // 2
+
+        self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
+        self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
+        self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
+
+    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
+        first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1)
+        hidden_states = self.conv_pre(first_half) * padding_mask
+        hidden_states = self.wavenet(hidden_states, padding_mask, global_conditioning)
+        mean = self.conv_post(hidden_states) * padding_mask
+        log_stddev = torch.zeros_like(mean)
+
+        if not reverse:
+            second_half = mean + second_half * torch.exp(log_stddev) * padding_mask
+            outputs = torch.cat([first_half, second_half], dim=1)
+            log_determinant = torch.sum(log_stddev, [1, 2])
+            return outputs, log_determinant
+        else:
+            second_half = (second_half - mean) * torch.exp(-log_stddev) * padding_mask
+            outputs = torch.cat([first_half, second_half], dim=1)
+            return outputs, None
+
+
+class VitsResidualCouplingBlock(nn.Module):
+    def __init__(self, config: VitsConfig):
+        super().__init__()
+        self.flows = nn.ModuleList()
+        for _ in range(config.prior_encoder_num_flows):
+            self.flows.append(VitsResidualCouplingLayer(config))
+
+    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                inputs, _ = flow(inputs, padding_mask, global_conditioning)
+                inputs = torch.flip(inputs, [1])
+        else:
+            for flow in reversed(self.flows):
+                inputs = torch.flip(inputs, [1])
+                inputs, _ = flow(inputs, padding_mask, global_conditioning, reverse=True)
+        return inputs
+
+
+class VitsDilatedDepthSeparableConv(nn.Module):
+    def __init__(self, config: VitsConfig, 
dropout_rate=0.0): + super().__init__() + kernel_size = config.duration_predictor_kernel_size + channels = config.hidden_size + self.num_layers = config.depth_separable_num_layers + + self.dropout = nn.Dropout(dropout_rate) + self.convs_dilated = nn.ModuleList() + self.convs_pointwise = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(self.num_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_dilated.append( + nn.Conv1d( + in_channels=channels, + out_channels=channels, + kernel_size=kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_pointwise.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(nn.LayerNorm(channels)) + self.norms_2.append(nn.LayerNorm(channels)) + + def forward(self, inputs, padding_mask, global_conditioning=None): + if global_conditioning is not None: + inputs = inputs + global_conditioning + + for i in range(self.num_layers): + hidden_states = self.convs_dilated[i](inputs * padding_mask) + hidden_states = self.norms_1[i](hidden_states.transpose(1, -1)).transpose(1, -1) + hidden_states = nn.functional.gelu(hidden_states) + hidden_states = self.convs_pointwise[i](hidden_states) + hidden_states = self.norms_2[i](hidden_states.transpose(1, -1)).transpose(1, -1) + hidden_states = nn.functional.gelu(hidden_states) + hidden_states = self.dropout(hidden_states) + inputs = inputs + hidden_states + + return inputs * padding_mask + + +class VitsConvFlow(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.filter_channels = config.hidden_size + self.half_channels = config.depth_separable_channels // 2 + self.num_bins = config.duration_predictor_flow_bins + self.tail_bound = config.duration_predictor_tail_bound + + self.conv_pre = nn.Conv1d(self.half_channels, self.filter_channels, 1) + self.conv_dds = VitsDilatedDepthSeparableConv(config) + self.conv_proj = nn.Conv1d(self.filter_channels, self.half_channels * (self.num_bins * 3 - 1), 1) + + def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False): + first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1) + + hidden_states = self.conv_pre(first_half) + hidden_states = self.conv_dds(hidden_states, padding_mask, global_conditioning) + hidden_states = self.conv_proj(hidden_states) * padding_mask + + batch_size, channels, length = first_half.shape + hidden_states = hidden_states.reshape(batch_size, channels, -1, length).permute(0, 1, 3, 2) + + unnormalized_widths = hidden_states[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = hidden_states[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_derivatives = hidden_states[..., 2 * self.num_bins :] + + second_half, log_abs_det = _unconstrained_rational_quadratic_spline( + second_half, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + reverse=reverse, + tail_bound=self.tail_bound, + ) + + outputs = torch.cat([first_half, second_half], dim=1) * padding_mask + if not reverse: + log_determinant = torch.sum(log_abs_det * padding_mask, [1, 2]) + return outputs, log_determinant + else: + return outputs, None + + +class VitsElementwiseAffine(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.channels = config.depth_separable_channels + self.translate = nn.Parameter(torch.zeros(self.channels, 1)) + self.log_scale = 
nn.Parameter(torch.zeros(self.channels, 1)) + + def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False): + if not reverse: + outputs = self.translate + torch.exp(self.log_scale) * inputs + outputs = outputs * padding_mask + log_determinant = torch.sum(self.log_scale * padding_mask, [1, 2]) + return outputs, log_determinant + else: + outputs = (inputs - self.translate) * torch.exp(-self.log_scale) * padding_mask + return outputs, None + + +class VitsStochasticDurationPredictor(nn.Module): + def __init__(self, config): + super().__init__() + embed_dim = config.speaker_embedding_size + filter_channels = config.hidden_size + + self.conv_pre = nn.Conv1d(filter_channels, filter_channels, 1) + self.conv_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.conv_dds = VitsDilatedDepthSeparableConv( + config, + dropout_rate=config.duration_predictor_dropout, + ) + + if embed_dim != 0: + self.cond = nn.Conv1d(embed_dim, filter_channels, 1) + + self.flows = nn.ModuleList() + self.flows.append(VitsElementwiseAffine(config)) + for _ in range(config.duration_predictor_num_flows): + self.flows.append(VitsConvFlow(config)) + + self.post_conv_pre = nn.Conv1d(1, filter_channels, 1) + self.post_conv_proj = nn.Conv1d(filter_channels, filter_channels, 1) + self.post_conv_dds = VitsDilatedDepthSeparableConv( + config, + dropout_rate=config.duration_predictor_dropout, + ) + + self.post_flows = nn.ModuleList() + self.post_flows.append(VitsElementwiseAffine(config)) + for _ in range(config.duration_predictor_num_flows): + self.post_flows.append(VitsConvFlow(config)) + + def forward(self, inputs, padding_mask, global_conditioning=None, durations=None, reverse=False, noise_scale=1.0): + inputs = torch.detach(inputs) + inputs = self.conv_pre(inputs) + + if global_conditioning is not None: + global_conditioning = torch.detach(global_conditioning) + inputs = inputs + self.cond(global_conditioning) + + inputs = self.conv_dds(inputs, padding_mask) + inputs = self.conv_proj(inputs) * padding_mask + + if not reverse: + hidden_states = self.post_conv_pre(durations) + hidden_states = self.post_conv_dds(hidden_states, padding_mask) + hidden_states = self.post_conv_proj(hidden_states) * padding_mask + + random_posterior = ( + torch.randn(durations.size(0), 2, durations.size(2)).to(device=inputs.device, dtype=inputs.dtype) + * padding_mask + ) + log_determinant_posterior_sum = 0 + latents_posterior = random_posterior + for flow in self.post_flows: + latents_posterior, log_determinant = flow( + latents_posterior, padding_mask, global_conditioning=inputs + hidden_states + ) + latents_posterior = torch.flip(latents_posterior, [1]) + log_determinant_posterior_sum += log_determinant + + first_half, second_half = torch.split(latents_posterior, [1, 1], dim=1) + + log_determinant_posterior_sum += torch.sum( + (nn.functional.logsigmoid(first_half) + nn.functional.logsigmoid(-first_half)) * padding_mask, [1, 2] + ) + logq = ( + torch.sum(-0.5 * (math.log(2 * math.pi) + (random_posterior**2)) * padding_mask, [1, 2]) + - log_determinant_posterior_sum + ) + + first_half = (durations - torch.sigmoid(first_half)) * padding_mask + first_half = torch.log(torch.clamp_min(first_half, 1e-5)) * padding_mask + log_determinant_sum = torch.sum(-first_half, [1, 2]) + + latents = torch.cat([first_half, second_half], dim=1) + for flow in self.flows: + latents, log_determinant = flow(latents, padding_mask, global_conditioning=inputs) + latents = torch.flip(latents, [1]) + log_determinant_sum += log_determinant + + nll = 
torch.sum(0.5 * (math.log(2 * math.pi) + (latents**2)) * padding_mask, [1, 2]) - log_determinant_sum + return nll + logq + else: + flows = list(reversed(self.flows)) + flows = flows[:-2] + [flows[-1]] # remove a useless vflow + + latents = ( + torch.randn(inputs.size(0), 2, inputs.size(2)).to(device=inputs.device, dtype=inputs.dtype) + * noise_scale + ) + for flow in flows: + latents = torch.flip(latents, [1]) + latents, _ = flow(latents, padding_mask, global_conditioning=inputs, reverse=True) + + log_duration, _ = torch.split(latents, [1, 1], dim=1) + return log_duration + + +class VitsDurationPredictor(nn.Module): + def __init__(self, config): + super().__init__() + kernel_size = config.duration_predictor_kernel_size + filter_channels = config.duration_predictor_filter_channels + + self.dropout = nn.Dropout(config.duration_predictor_dropout) + self.conv_1 = nn.Conv1d(config.hidden_size, filter_channels, kernel_size, padding=kernel_size // 2) + self.norm_1 = nn.LayerNorm(filter_channels, eps=config.layer_norm_eps) + self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2) + self.norm_2 = nn.LayerNorm(filter_channels, eps=config.layer_norm_eps) + self.proj = nn.Conv1d(filter_channels, 1, 1) + + if config.speaker_embedding_size != 0: + self.cond = nn.Conv1d(config.speaker_embedding_size, config.hidden_size, 1) + + def forward(self, inputs, padding_mask, global_conditioning=None): + inputs = torch.detach(inputs) + + if global_conditioning is not None: + global_conditioning = torch.detach(global_conditioning) + inputs = inputs + self.cond(global_conditioning) + + inputs = self.conv_1(inputs * padding_mask) + inputs = torch.relu(inputs) + inputs = self.norm_1(inputs.transpose(1, -1)).transpose(1, -1) + inputs = self.dropout(inputs) + + inputs = self.conv_2(inputs * padding_mask) + inputs = torch.relu(inputs) + inputs = self.norm_2(inputs.transpose(1, -1)).transpose(1, -1) + inputs = self.dropout(inputs) + + inputs = self.proj(inputs * padding_mask) + return inputs * padding_mask + + +class VitsAttention(nn.Module): + """Multi-headed attention with relative positional representation.""" + + def __init__(self, config: VitsConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.dropout = config.attention_dropout + self.window_size = config.window_size + + self.head_dim = self.embed_dim // self.num_heads + self.scaling = self.head_dim**-0.5 + + if (self.head_dim * self.num_heads) != self.embed_dim: + raise ValueError( + f"hidden_size must be divisible by num_attention_heads (got `hidden_size`: {self.embed_dim}" + f" and `num_attention_heads`: {self.num_heads})." 
+ ) + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias) + + if self.window_size: + self.emb_rel_k = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling) + self.emb_rel_v = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if self.window_size is not None: + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len) + relative_logits = torch.matmul(query_states, key_relative_embeddings.transpose(-2, -1)) + rel_pos_bias = self._relative_position_to_absolute_position(relative_logits) + attn_weights += rel_pos_bias + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        if self.window_size is not None:
+            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, src_len)
+            relative_weights = self._absolute_position_to_relative_position(attn_probs)
+            rel_pos_bias = torch.matmul(relative_weights, value_relative_embeddings)
+            attn_output += rel_pos_bias
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        pad_length = max(length - (self.window_size + 1), 0)
+        if pad_length > 0:
+            relative_embeddings = nn.functional.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0])
+
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        return relative_embeddings[:, slice_start_position:slice_end_position]
+
+    def _relative_position_to_absolute_position(self, x):
+        batch_heads, length, _ = x.size()
+
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = nn.functional.pad(x, [0, 1, 0, 0, 0, 0])
+
+        # Concat extra elements so as to add up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch_heads, length * 2 * length])
+        x_flat = nn.functional.pad(x_flat, [0, length - 1, 0, 0])
+
+        # Reshape and slice out the padded elements. 
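+        # the result has shape (batch_heads, length, length): one relative score per (query, key) pair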
+ x_final = x_flat.view([batch_heads, length + 1, 2 * length - 1]) + x_final = x_final[:, :length, length - 1 :] + return x_final + + def _absolute_position_to_relative_position(self, x): + batch_heads, length, _ = x.size() + + # Pad along column + x = nn.functional.pad(x, [0, length - 1, 0, 0, 0, 0]) + x_flat = x.view([batch_heads, length * (2 * length - 1)]) + + # Add 0's in the beginning that will skew the elements after reshape + x_flat = nn.functional.pad(x_flat, [length, 0, 0, 0]) + x_final = x_flat.view([batch_heads, length, 2 * length])[:, :, 1:] + return x_final + + +class VitsFeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size) + self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size) + self.dropout = nn.Dropout(config.activation_dropout) + + if isinstance(config.hidden_act, str): + self.act_fn = ACT2FN[config.hidden_act] + else: + self.act_fn = config.hidden_act + + if config.ffn_kernel_size > 1: + pad_left = (config.ffn_kernel_size - 1) // 2 + pad_right = config.ffn_kernel_size // 2 + self.padding = [pad_left, pad_right, 0, 0, 0, 0] + else: + self.padding = None + + def forward(self, hidden_states, padding_mask): + hidden_states = hidden_states.permute(0, 2, 1) + padding_mask = padding_mask.permute(0, 2, 1) + + hidden_states = hidden_states * padding_mask + if self.padding is not None: + hidden_states = nn.functional.pad(hidden_states, self.padding) + + hidden_states = self.conv_1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states * padding_mask + if self.padding is not None: + hidden_states = nn.functional.pad(hidden_states, self.padding) + + hidden_states = self.conv_2(hidden_states) + hidden_states = hidden_states * padding_mask + + hidden_states = hidden_states.permute(0, 2, 1) + return hidden_states + + +class VitsEncoderLayer(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.attention = VitsAttention(config) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = VitsFeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + padding_mask: torch.FloatTensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + residual = hidden_states + hidden_states, attn_weights = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.layer_norm(residual + hidden_states) + + residual = hidden_states + hidden_states = self.feed_forward(hidden_states, padding_mask) + hidden_states = self.dropout(hidden_states) + hidden_states = self.final_layer_norm(residual + hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class VitsEncoder(nn.Module): + def __init__(self, config: VitsConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + self.layerdrop = config.layerdrop + + def forward( + self, + hidden_states: torch.FloatTensor, + padding_mask: torch.FloatTensor, + 
attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) + + hidden_states = hidden_states * padding_mask + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = self.training and (dropout_probability < self.layerdrop) + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + padding_mask, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask=attention_mask, + padding_mask=padding_mask, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = hidden_states * padding_mask + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class VitsTextEncoder(nn.Module): + """ + Transformer encoder that uses relative positional representation instead of absolute positional encoding. 
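+    A final 1x1 convolution projects the hidden states onto the mean and log-variance of the prior distribution
+    over the flow latents (`prior_means` and `prior_log_variances`).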
+ """ + + def __init__(self, config: VitsConfig): + super().__init__() + self.config = config + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + self.encoder = VitsEncoder(config) + self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: torch.Tensor, + padding_mask: torch.FloatTensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], VitsTextEncoderOutput]: + hidden_states = self.embed_tokens(input_ids) * math.sqrt(self.config.hidden_size) + + encoder_outputs = self.encoder( + hidden_states=hidden_states, + padding_mask=padding_mask, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] if not return_dict else encoder_outputs.last_hidden_state + + stats = self.project(last_hidden_state.transpose(1, 2)).transpose(1, 2) * padding_mask + prior_means, prior_log_variances = torch.split(stats, self.config.flow_size, dim=2) + + if not return_dict: + outputs = (last_hidden_state, prior_means, prior_log_variances) + encoder_outputs[1:] + return outputs + + return VitsTextEncoderOutput( + last_hidden_state=last_hidden_state, + prior_means=prior_means, + prior_log_variances=prior_log_variances, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class VitsPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = VitsConfig + base_model_prefix = "vits" + main_input_name = "input_ids" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Conv1d): + nn.init.kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0])) + nn.init.uniform_(module.bias, a=-k, b=k) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +VITS_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`VitsConfig`]): + Model configuration class with all the parameters of the model. 
Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +VITS_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, + 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + speaker_id (`int`, *optional*): + Which speaker embedding to use. Only used for multispeaker models. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The complete VITS model, for text-to-speech synthesis.", + VITS_START_DOCSTRING, +) +class VitsModel(VitsPreTrainedModel): + def __init__(self, config: VitsConfig): + super().__init__(config) + self.config = config + self.text_encoder = VitsTextEncoder(config) + self.flow = VitsResidualCouplingBlock(config) + self.decoder = VitsHifiGan(config) + + if config.use_stochastic_duration_prediction: + self.duration_predictor = VitsStochasticDurationPredictor(config) + else: + self.duration_predictor = VitsDurationPredictor(config) + + if config.num_speakers > 1: + self.embed_speaker = nn.Embedding(config.num_speakers, config.speaker_embedding_size) + + # This is used only for training. + self.posterior_encoder = VitsPosteriorEncoder(config) + + # These parameters control the synthesised speech properties + self.speaking_rate = config.speaking_rate + self.noise_scale = config.noise_scale + self.noise_scale_duration = config.noise_scale_duration + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.text_encoder + + @add_start_docstrings_to_model_forward(VITS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=VitsModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + speaker_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.FloatTensor] = None, + ) -> Union[Tuple[Any], VitsModelOutput]: + r""" + labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*): + Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss + computation. 
+ + Returns: + + Example: + + ```python + >>> from transformers import VitsTokenizer, VitsModel, set_seed + >>> import torch + + >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") + >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng") + + >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt") + + >>> set_seed(555) # make deterministic + + >>> with torch.no_grad(): + ... outputs = model(inputs["input_ids"]) + >>> outputs.waveform.shape + torch.Size([1, 45824]) + ``` + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + raise NotImplementedError("Training of VITS is not supported yet.") + + mask_dtype = self.text_encoder.embed_tokens.weight.dtype + if attention_mask is not None: + input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype) + else: + input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype) + + if self.config.num_speakers > 1 and speaker_id is not None: + if not 0 <= speaker_id < self.config.num_speakers: + raise ValueError(f"Set `speaker_id` in the range 0-{self.config.num_speakers - 1}.") + if isinstance(speaker_id, int): + speaker_id = torch.full(size=(1,), fill_value=speaker_id, device=self.device) + speaker_embeddings = self.embed_speaker(speaker_id).unsqueeze(-1) + else: + speaker_embeddings = None + + text_encoder_output = self.text_encoder( + input_ids=input_ids, + padding_mask=input_padding_mask, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state + hidden_states = hidden_states.transpose(1, 2) + input_padding_mask = input_padding_mask.transpose(1, 2) + prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means + prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances + + if self.config.use_stochastic_duration_prediction: + log_duration = self.duration_predictor( + hidden_states, + input_padding_mask, + speaker_embeddings, + reverse=True, + noise_scale=self.noise_scale_duration, + ) + else: + log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings) + + length_scale = 1.0 / self.speaking_rate + duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale) + predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long() + + # Create a padding mask for the output lengths of shape (batch, 1, max_output_length) + indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device) + output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1) + output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype) + + # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length) + attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1) + batch_size, _, output_length, input_length = attn_mask.shape + cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1) + indices = torch.arange(output_length, 
dtype=duration.dtype, device=duration.device) + valid_indices = indices.unsqueeze(0) < cum_duration + valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length) + padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1] + attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask + + # Expand prior distribution + prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2) + prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2) + + prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale + latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True) + + spectrogram = latents * output_padding_mask + waveform = self.decoder(spectrogram, speaker_embeddings) + waveform = waveform.squeeze(1) + sequence_lengths = predicted_lengths * np.prod(self.config.upsample_rates) + + if not return_dict: + outputs = (waveform, sequence_lengths, spectrogram) + text_encoder_output[3:] + return outputs + + return VitsModelOutput( + waveform=waveform, + sequence_lengths=sequence_lengths, + spectrogram=spectrogram, + hidden_states=text_encoder_output.hidden_states, + attentions=text_encoder_output.attentions, + ) + + +__all__ = ["VitsModel", "VitsPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/vits/tokenization_vits.py b/docs/transformers/build/lib/transformers/models/vits/tokenization_vits.py new file mode 100644 index 0000000000000000000000000000000000000000..ca40c80c124cae78e04bc65fd67982dcbd6020d6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vits/tokenization_vits.py @@ -0,0 +1,246 @@ +# coding=utf-8 +# Copyright 2023 The Kakao Enterprise Authors, the MMS-TTS Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for VITS.""" + +import json +import os +import re +from typing import Any, Dict, List, Optional, Tuple, Union + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import is_phonemizer_available, is_uroman_available, logging + + +if is_phonemizer_available(): + import phonemizer + +if is_uroman_available(): + import uroman as ur + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"} + + +def has_non_roman_characters(input_string): + # Find any character outside the ASCII range + non_roman_pattern = re.compile(r"[^\x00-\x7F]") + + # Search the input string for non-Roman characters + match = non_roman_pattern.search(input_string) + has_non_roman = match is not None + return has_non_roman + + +class VitsTokenizer(PreTrainedTokenizer): + """ + Construct a VITS tokenizer. Also supports MMS-TTS. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. 
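+
+    Example (a minimal usage sketch, using the same `facebook/mms-tts-eng` checkpoint as the modeling example):
+
+    ```python
+    >>> from transformers import VitsTokenizer
+
+    >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
+    >>> inputs = tokenizer(text="Hello, my dog is cute", return_tensors="pt")
+    ```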
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        language (`str`, *optional*):
+            Language identifier.
+        add_blank (`bool`, *optional*, defaults to `True`):
+            Whether to insert token id 0 in between the other tokens.
+        normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the input text by removing all casing and punctuation.
+        phonemize (`bool`, *optional*, defaults to `True`):
+            Whether to convert the input text into phonemes.
+        is_uroman (`bool`, *optional*, defaults to `False`):
+            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        pad_token="<pad>",
+        unk_token="<unk>",
+        language=None,
+        add_blank=True,
+        normalize=True,
+        phonemize=True,
+        is_uroman=False,
+        **kwargs,
+    ) -> None:
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.language = language
+        self.add_blank = add_blank
+        self.normalize = normalize
+        self.phonemize = phonemize
+
+        self.is_uroman = is_uroman
+
+        super().__init__(
+            pad_token=pad_token,
+            unk_token=unk_token,
+            language=language,
+            add_blank=add_blank,
+            normalize=normalize,
+            phonemize=phonemize,
+            is_uroman=is_uroman,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def normalize_text(self, input_string):
+        """Lowercase the input string, respecting any special tokens that may be partly or entirely upper-cased."""
+        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
+        filtered_text = ""
+
+        i = 0
+        while i < len(input_string):
+            found_match = False
+            for word in all_vocabulary:
+                if input_string[i : i + len(word)] == word:
+                    filtered_text += word
+                    i += len(word)
+                    found_match = True
+                    break
+
+            if not found_match:
+                filtered_text += input_string[i].lower()
+                i += 1
+
+        return filtered_text
+
+    def _preprocess_char(self, text):
+        """Special treatment of characters in certain languages."""
+        if self.language == "ron":
+            text = text.replace("ț", "ţ")
+        return text
+
+    def prepare_for_tokenization(
+        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
+    ) -> Tuple[str, Dict[str, Any]]:
+        """
+        Performs any necessary transformations before tokenization.
+
+        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
+        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
+
+        Args:
+            text (`str`):
+                The text to prepare.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+                which it will tokenize.
+            normalize (`bool`, *optional*, defaults to `None`):
+                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
+                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
+                text consists only of lower-case characters.
+            kwargs (`Dict[str, Any]`, *optional*):
+                Keyword arguments to use for the tokenization. 
+
+        Returns:
+            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
+        """
+        normalize = normalize if normalize is not None else self.normalize
+
+        if normalize:
+            # normalise for casing
+            text = self.normalize_text(text)
+
+        filtered_text = self._preprocess_char(text)
+
+        if has_non_roman_characters(filtered_text) and self.is_uroman:
+            if not is_uroman_available():
+                logger.warning(
+                    "Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
+                    "step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman`. "
+                    "Note that `uroman` requires python version >= 3.10. Otherwise, apply the Romanizer manually "
+                    "as per the instructions: https://github.com/isi-nlp/uroman"
+                )
+            else:
+                uroman = ur.Uroman()
+                filtered_text = uroman.romanize_string(filtered_text)
+
+        if self.phonemize:
+            if not is_phonemizer_available():
+                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")
+
+            filtered_text = phonemizer.phonemize(
+                filtered_text,
+                language="en-us",
+                backend="espeak",
+                strip=True,
+                preserve_punctuation=True,
+                with_stress=True,
+            )
+            filtered_text = re.sub(r"\s+", " ", filtered_text)
+        elif normalize:
+            # strip any chars outside of the vocab (punctuation)
+            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()
+
+        return filtered_text, kwargs
+
+    def _tokenize(self, text: str) -> List[str]:
+        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
+        tokens = list(text)
+
+        if self.add_blank:
+            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2 + 1)
+            interspersed[1::2] = tokens
+            tokens = interspersed
+
+        return tokens
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        if self.add_blank and len(tokens) > 1:
+            tokens = tokens[1::2]
+        return "".join(tokens)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) into an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) into a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Union[Tuple[str], None]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        return (vocab_file,)
+
+
+__all__ = ["VitsTokenizer"]
diff --git a/docs/transformers/build/lib/transformers/models/vivit/__init__.py b/docs/transformers/build/lib/transformers/models/vivit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..13c52cbcff0b3fc6dc3255f41729754991dd3c8f
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/vivit/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_vivit import * + from .image_processing_vivit import * + from .modeling_vivit import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/vivit/configuration_vivit.py b/docs/transformers/build/lib/transformers/models/vivit/configuration_vivit.py new file mode 100644 index 0000000000000000000000000000000000000000..42863454e81c278ee7af62fee06b19ec9b6ca2f0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vivit/configuration_vivit.py @@ -0,0 +1,119 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ViViT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class VivitConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VivitModel`]. It is used to instantiate a ViViT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the ViViT + [google/vivit-b-16x2-kinetics400](https://huggingface.co/google/vivit-b-16x2-kinetics400) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + num_frames (`int`, *optional*, defaults to 32): + The number of frames in each video. + tubelet_size (`List[int]`, *optional*, defaults to `[2, 16, 16]`): + The size (resolution) of each tubelet. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_fast"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"`, `"gelu_fast"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the layer normalization layers. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + + Example: + + ```python + >>> from transformers import VivitConfig, VivitModel + + >>> # Initializing a ViViT google/vivit-b-16x2-kinetics400 style configuration + >>> configuration = VivitConfig() + + >>> # Initializing a model (with random weights) from the google/vivit-b-16x2-kinetics400 style configuration + >>> model = VivitModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "vivit" + + def __init__( + self, + image_size=224, + num_frames=32, + tubelet_size=[2, 16, 16], + num_channels=3, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu_fast", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-06, + qkv_bias=True, + **kwargs, + ): + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.image_size = image_size + self.num_frames = num_frames + self.tubelet_size = tubelet_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + + super().__init__(**kwargs) + + +__all__ = ["VivitConfig"] diff --git a/docs/transformers/build/lib/transformers/models/vivit/convert_vivit_flax_to_pytorch.py b/docs/transformers/build/lib/transformers/models/vivit/convert_vivit_flax_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..f4b5e1cfda311b51f62b85a059e420841feda708 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vivit/convert_vivit_flax_to_pytorch.py @@ -0,0 +1,231 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Flax ViViT checkpoints from the original repository to PyTorch. URL: +https://github.com/google-research/scenic/tree/main/scenic/projects/vivit +""" + +import argparse +import json +import os.path +from collections import OrderedDict + +import numpy as np +import requests +import torch +from flax.training.checkpoints import restore_checkpoint +from huggingface_hub import hf_hub_download + +from transformers import VivitConfig, VivitForVideoClassification, VivitImageProcessor +from transformers.image_utils import PILImageResampling + + +def download_checkpoint(path): + url = "https://storage.googleapis.com/scenic-bucket/vivit/kinetics_400/vivit_base_16x2_unfactorized/checkpoint" + + with open(path, "wb") as f: + with requests.get(url, stream=True) as req: + for chunk in req.iter_content(chunk_size=2048): + f.write(chunk) + + +def get_vivit_config() -> VivitConfig: + config = VivitConfig() + + config.num_labels = 400 + repo_id = "huggingface/label-files" + filename = "kinetics400-id2label.json" + + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + return config + + +# We will verify our results on a video of eating spaghetti +# Frame indices used: [ 47, 51, 55, 59, 63, 67, 71, 75, 80, 84, 88, 92, 96, 100, 104, 108, 113, 117, +# 121, 125, 129, 133, 137, 141, 146, 150, 154, 158, 162, 166, 170, 174] +def prepare_video(): + file = hf_hub_download( + repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti_32_frames.npy", repo_type="dataset" + ) + video = np.load(file) + return list(video) + + +def transform_attention(current: np.ndarray): + if np.ndim(current) == 2: + return transform_attention_bias(current) + + elif np.ndim(current) == 3: + return transform_attention_kernel(current) + + else: + raise Exception(f"Invalid number of dimensions: {np.ndim(current)}") + + +def transform_attention_bias(current: np.ndarray): + return current.flatten() + + +def transform_attention_kernel(current: np.ndarray): + return np.reshape(current, (current.shape[0], current.shape[1] * current.shape[2])).T + + +def transform_attention_output_weight(current: np.ndarray): + return np.reshape(current, (current.shape[0] * current.shape[1], current.shape[2])).T + + +def transform_state_encoder_block(state_dict, i): + state = state_dict["optimizer"]["target"]["Transformer"][f"encoderblock_{i}"] + + prefix = f"encoder.layer.{i}." 
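+    # Map this block's Flax parameter tree onto flat PyTorch state-dict keys. Dense and attention
+    # kernels are transposed (or reshaped) because Flax stores weights as (in, out) while PyTorch
+    # expects (out, in).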
+ new_state = { + prefix + "intermediate.dense.bias": state["MlpBlock_0"]["Dense_0"]["bias"], + prefix + "intermediate.dense.weight": np.transpose(state["MlpBlock_0"]["Dense_0"]["kernel"]), + prefix + "output.dense.bias": state["MlpBlock_0"]["Dense_1"]["bias"], + prefix + "output.dense.weight": np.transpose(state["MlpBlock_0"]["Dense_1"]["kernel"]), + prefix + "layernorm_before.bias": state["LayerNorm_0"]["bias"], + prefix + "layernorm_before.weight": state["LayerNorm_0"]["scale"], + prefix + "layernorm_after.bias": state["LayerNorm_1"]["bias"], + prefix + "layernorm_after.weight": state["LayerNorm_1"]["scale"], + prefix + "attention.attention.query.bias": transform_attention( + state["MultiHeadDotProductAttention_0"]["query"]["bias"] + ), + prefix + "attention.attention.query.weight": transform_attention( + state["MultiHeadDotProductAttention_0"]["query"]["kernel"] + ), + prefix + "attention.attention.key.bias": transform_attention( + state["MultiHeadDotProductAttention_0"]["key"]["bias"] + ), + prefix + "attention.attention.key.weight": transform_attention( + state["MultiHeadDotProductAttention_0"]["key"]["kernel"] + ), + prefix + "attention.attention.value.bias": transform_attention( + state["MultiHeadDotProductAttention_0"]["value"]["bias"] + ), + prefix + "attention.attention.value.weight": transform_attention( + state["MultiHeadDotProductAttention_0"]["value"]["kernel"] + ), + prefix + "attention.output.dense.bias": state["MultiHeadDotProductAttention_0"]["out"]["bias"], + prefix + "attention.output.dense.weight": transform_attention_output_weight( + state["MultiHeadDotProductAttention_0"]["out"]["kernel"] + ), + } + + return new_state + + +def get_n_layers(state_dict): + return sum([1 if "encoderblock_" in k else 0 for k in state_dict["optimizer"]["target"]["Transformer"].keys()]) + + +def transform_state(state_dict, classification_head=False): + transformer_layers = get_n_layers(state_dict) + + new_state = OrderedDict() + + new_state["layernorm.bias"] = state_dict["optimizer"]["target"]["Transformer"]["encoder_norm"]["bias"] + new_state["layernorm.weight"] = state_dict["optimizer"]["target"]["Transformer"]["encoder_norm"]["scale"] + + new_state["embeddings.patch_embeddings.projection.weight"] = np.transpose( + state_dict["optimizer"]["target"]["embedding"]["kernel"], (4, 3, 0, 1, 2) + ) + new_state["embeddings.patch_embeddings.projection.bias"] = state_dict["optimizer"]["target"]["embedding"]["bias"] + + new_state["embeddings.cls_token"] = state_dict["optimizer"]["target"]["cls"] + new_state["embeddings.position_embeddings"] = state_dict["optimizer"]["target"]["Transformer"]["posembed_input"][ + "pos_embedding" + ] + + for i in range(transformer_layers): + new_state.update(transform_state_encoder_block(state_dict, i)) + + if classification_head: + new_state = {"vivit." 
+ k: v for k, v in new_state.items()}
+        new_state["classifier.weight"] = np.transpose(state_dict["optimizer"]["target"]["output_projection"]["kernel"])
+        new_state["classifier.bias"] = np.transpose(state_dict["optimizer"]["target"]["output_projection"]["bias"])
+
+    return {k: torch.tensor(v) for k, v in new_state.items()}
+
+
+# checks that image processor settings are the same as in the original implementation
+# original: https://github.com/google-research/scenic/blob/main/scenic/projects/vivit/data/video_tfrecord_dataset.py
+# dataset specific config:
+# https://github.com/google-research/scenic/blob/main/scenic/projects/vivit/configs/kinetics400/vivit_base_k400.py
+def get_processor() -> VivitImageProcessor:
+    extractor = VivitImageProcessor()
+
+    assert extractor.do_resize is True
+    assert extractor.size == {"shortest_edge": 256}
+    assert extractor.do_center_crop is True
+    assert extractor.crop_size == {"width": 224, "height": 224}
+    assert extractor.resample == PILImageResampling.BILINEAR
+
+    # here: https://github.com/deepmind/dmvr/blob/master/dmvr/modalities.py
+    # one can see that add_image has default values for normalization_mean and normalization_std set to 0 and 1
+    # which effectively means no normalization (and ViViT does not overwrite those when calling this func)
+    assert extractor.do_normalize is False
+    assert extractor.do_rescale is True
+    assert extractor.rescale_factor == 1 / 127.5
+
+    # zero-centering = True in original implementation; the processor expresses this via `offset`
+    assert extractor.offset is True
+
+    return extractor
+
+
+def convert(output_path: str):
+    flax_model_path = "checkpoint"
+
+    if not os.path.exists(flax_model_path):
+        download_checkpoint(flax_model_path)
+
+    state_dict = restore_checkpoint(flax_model_path, None)
+    new_state = transform_state(state_dict, classification_head=True)
+
+    config = get_vivit_config()
+
+    assert config.image_size == 224
+    assert config.num_frames == 32
+
+    model = VivitForVideoClassification(config)
+    model.load_state_dict(new_state)
+    model.eval()
+
+    extractor = get_processor()
+
+    video = prepare_video()
+    inputs = extractor(video, return_tensors="pt")
+
+    outputs = model(**inputs)
+
+    expected_shape = torch.Size([1, 400])
+    expected_slice = torch.tensor([-1.0543, 2.0764, -0.2104, 0.4439, -0.9658])
+
+    assert outputs.logits.shape == expected_shape
+    assert torch.allclose(outputs.logits[0, :5], expected_slice, atol=1e-4), outputs.logits[0, :5]
+
+    model.save_pretrained(output_path)
+    extractor.save_pretrained(output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--output_model_name", "-o", type=str, help="Output path for the converted HuggingFace model")
+
+    args = parser.parse_args()
+    convert(args.output_model_name)
diff --git a/docs/transformers/build/lib/transformers/models/vivit/image_processing_vivit.py b/docs/transformers/build/lib/transformers/models/vivit/image_processing_vivit.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b369be41ba883b0d9a853a043a7a17417024fc4
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/vivit/image_processing_vivit.py
@@ -0,0 +1,407 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Vivit."""
+
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from transformers.utils import is_vision_available
+from transformers.utils.generic import TensorType
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    get_resize_output_image_size,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_valid_image,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import filter_out_non_signature_kwargs, logging
+
+
+if is_vision_available():
+    import PIL
+
+logger = logging.get_logger(__name__)
+
+
+def make_batched(videos) -> List[List[ImageInput]]:
+    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
+        return videos
+
+    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
+        return [videos]
+
+    elif is_valid_image(videos):
+        return [[videos]]
+
+    raise ValueError(f"Could not make batched video from {videos}")
+
+
+class VivitImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Vivit image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 256}`):
+            Size of the output image after resizing. The shortest edge of the image will be resized to
+            `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
+            `size` in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
+            parameter in the `preprocess` method.
+        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
+            `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/127.5`):
+            Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
+            in the `preprocess` method.
+        offset (`bool`, *optional*, defaults to `True`):
+            Whether to scale the image in both negative and positive directions. Can be overridden by the `offset`
+            parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_center_crop: bool = True,
+        crop_size: Dict[str, int] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 127.5,
+        offset: bool = True,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 256}
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+        self.do_resize = do_resize
+        self.size = size
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.offset = offset
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
+                have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
+                shortest edge of length `s` while keeping the aspect ratio of the original image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
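+
+        Example (an illustrative sketch added for clarity, not part of the original file; assumes an RGB
+        channels-first array and that Pillow is available):
+
+        ```python
+        >>> import numpy as np
+        >>> from transformers import VivitImageProcessor
+
+        >>> processor = VivitImageProcessor()
+        >>> frame = np.zeros((3, 270, 480), dtype=np.uint8)
+        >>> resized = processor.resize(frame, size={"shortest_edge": 256})
+        >>> resized.shape[1]  # the shorter edge (height) is now 256; width keeps the aspect ratio
+        256
+        ```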
+ """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" in size: + output_size = get_resize_output_image_size( + image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format + ) + elif "height" in size and "width" in size: + output_size = (size["height"], size["width"]) + else: + raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}") + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + # Copied from transformers.models.efficientnet.image_processing_efficientnet.EfficientNetImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + offset: bool = True, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ): + """ + Rescale an image by a scale factor. + + If `offset` is `True`, the image has its values rescaled by `scale` and then offset by 1. If `scale` is + 1/127.5, the image is rescaled between [-1, 1]. + image = image * scale - 1 + + If `offset` is `False`, and `scale` is 1/255, the image is rescaled between [0, 1]. + image = image * scale + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + offset (`bool`, *optional*): + Whether to scale the image in both negative and positive directions. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + rescaled_image = rescale( + image, scale=scale, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + + if offset: + rescaled_image = rescaled_image - 1 + + return rescaled_image + + def _preprocess_image( + self, + image: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: Optional[bool] = None, + crop_size: Dict[str, int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + offset: Optional[bool] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a single image.""" + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if offset and not do_rescale: + raise ValueError("For offset, do_rescale must also be set to True.") + + # All transformations expect numpy arrays. + image = to_numpy_array(image) + + if do_rescale and is_scaled_image(image): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." 
+            )
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        if do_resize:
+            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+        if do_center_crop:
+            image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
+
+        if do_rescale:
+            image = self.rescale(image=image, scale=rescale_factor, offset=offset, input_data_format=input_data_format)
+
+        if do_normalize:
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+
+        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        videos: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Dict[str, int] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        offset: Optional[bool] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            videos (`ImageInput`):
+                Video frames to preprocess. Expects a single or batch of video frames with pixel values ranging from 0
+                to 255. If passing in frames with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after applying resize.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the image after applying the center crop.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between `[-1, 1]` if `offset` is `True`, `[0, 1]` otherwise.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            offset (`bool`, *optional*, defaults to `self.offset`):
+                Whether to scale the image in both negative and positive directions.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the inferred channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + offset = offset if offset is not None else self.offset + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size") + + if not valid_images(videos): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + videos = make_batched(videos) + + videos = [ + [ + self._preprocess_image( + image=img, + do_resize=do_resize, + size=size, + resample=resample, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + offset=offset, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in video + ] + for video in videos + ] + + data = {"pixel_values": videos} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["VivitImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/vivit/modeling_vivit.py b/docs/transformers/build/lib/transformers/models/vivit/modeling_vivit.py new file mode 100644 index 0000000000000000000000000000000000000000..669106239a06a83cb1f4737f76eb7a5bed474227 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/vivit/modeling_vivit.py @@ -0,0 +1,844 @@ +# coding=utf-8 +# Copyright 2023 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ViViT model."""
+
+from typing import Callable, Optional, Set, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+    torch_int,
+)
+from .configuration_vivit import VivitConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "google/vivit-b-16x2-kinetics400"
+_CONFIG_FOR_DOC = "VivitConfig"
+
+
+class VivitTubeletEmbeddings(nn.Module):
+    """
+    Construct Vivit Tubelet embeddings.
+
+    This module turns a batch of videos of shape (batch_size, num_frames, num_channels, height, width) into a tensor of
+    shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.
+
+    The seq_len (the number of patches) equals (number of frames // tubelet_size[0]) * (height // tubelet_size[1]) *
+    (width // tubelet_size[2]).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_frames = config.num_frames
+        self.image_size = config.image_size
+        self.patch_size = config.tubelet_size
+        self.num_patches = (
+            (self.image_size // self.patch_size[2])
+            * (self.image_size // self.patch_size[1])
+            * (self.num_frames // self.patch_size[0])
+        )
+        self.embed_dim = config.hidden_size
+
+        self.projection = nn.Conv3d(
+            config.num_channels, config.hidden_size, kernel_size=config.tubelet_size, stride=config.tubelet_size
+        )
+
+    def forward(self, pixel_values, interpolate_pos_encoding: bool = False):
+        batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
+            raise ValueError(
+                f"Image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
+            )
+
+        # permute to (batch_size, num_channels, num_frames, height, width)
+        pixel_values = pixel_values.permute(0, 2, 1, 3, 4)
+
+        x = self.projection(pixel_values)
+        # out_batch_size, out_num_channels, out_num_frames, out_height, out_width = x.shape
+        # flattens time and space dimensions, transposes to (out_batch_size, flat_tokens, out_num_channels)
+        x = x.flatten(2).transpose(1, 2)
+        return x
+
+
+class VivitEmbeddings(nn.Module):
+    """
+    Vivit Embeddings.
+
+    Creates embeddings from a video using VivitTubeletEmbeddings, adds CLS token and positional embeddings.
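+
+    For the default configuration (32 frames of 224x224 pixels and a [2, 16, 16] tubelet size), this gives
+    (32 / 2) * (224 / 16) * (224 / 16) = 3136 patch tokens, plus the CLS token: a sequence length of 3137.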
+ """ + + def __init__(self, config): + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.patch_embeddings = VivitTubeletEmbeddings(config) + + self.position_embeddings = nn.Parameter( + torch.zeros(1, self.patch_embeddings.num_patches + 1, config.hidden_size) + ) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.tubelet_size[1:] + self.config = config + + # Adapted from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size[0] + new_width = width // self.patch_size[1] + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values, interpolate_pos_encoding: bool = False): + batch_size, num_frames, num_channels, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + cls_tokens = self.cls_token.tile([batch_size, 1, 1]) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. 
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Vivit +class VivitSelfAttention(nn.Module): + def __init__(self, config: VivitConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Vivit +class VivitSelfOutput(nn.Module): + """ + The residual connection is defined in VivitLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: VivitConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Vivit +class VivitAttention(nn.Module): + def __init__(self, config: VivitConfig) -> None: + super().__init__() + self.attention = VivitSelfAttention(config) + self.output = VivitSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class VivitIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class VivitOutput(nn.Module): + def __init__(self, config): + super().__init__() + 
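+        # Projects the intermediate (MLP) activations back down to hidden_size; the residual
+        # connection with the block input is applied in forward() below.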
self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +class VivitLayer(nn.Module): + """This corresponds to the EncoderBlock class in the scenic/vivit implementation.""" + + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = VivitAttention(config) + self.intermediate = VivitIntermediate(config) + self.output = VivitOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, head_mask=None, output_attentions=False): + self_attention_outputs = self.attention( + # in Vivit, layernorm is applied before self-attention + self.layernorm_before(hidden_states), + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + # add self attentions if we output attention weights + outputs = self_attention_outputs[1:] + + # first residual connection + hidden_states = attention_output + hidden_states + + # in Vivit, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class VivitEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([VivitLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + head_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class VivitPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
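+        # For ViViT the first token is the learned CLS token prepended in VivitEmbeddings,
+        # so the pooled output is a single hidden_size vector per video.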
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class VivitPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = VivitConfig
+    base_model_prefix = "vivit"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = []
+    _supports_sdpa = True
+    _supports_flash_attn_2 = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv3d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, VivitEmbeddings):
+            module.cls_token.data.zero_()
+            module.position_embeddings.data.zero_()
+
+
+VIVIT_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+    as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`VivitConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VIVIT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`VivitImageProcessor`]. See
+            [`VivitImageProcessor.preprocess`] for details.
+
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
+            Whether to interpolate the pre-trained position encodings.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+""" + + +@add_start_docstrings( + "The bare ViViT Transformer model outputting raw hidden-states without any specific head on top.", + VIVIT_START_DOCSTRING, +) +class VivitModel(VivitPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = VivitEmbeddings(config) + self.encoder = VivitEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = VivitPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. + + Args: + heads_to_prune: + dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIVIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> import av + >>> import numpy as np + + >>> from transformers import VivitImageProcessor, VivitModel + >>> from huggingface_hub import hf_hub_download + + >>> np.random.seed(0) + + + >>> def read_video_pyav(container, indices): + ... ''' + ... Decode the video with PyAV decoder. + ... Args: + ... container (`av.container.input.InputContainer`): PyAV container. + ... indices (`List[int]`): List of frame indices to decode. + ... Returns: + ... result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + ... ''' + ... frames = [] + ... container.seek(0) + ... start_index = indices[0] + ... end_index = indices[-1] + ... for i, frame in enumerate(container.decode(video=0)): + ... if i > end_index: + ... break + ... if i >= start_index and i in indices: + ... frames.append(frame) + ... return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... ''' + ... Sample a given number of frame indices from the video. + ... Args: + ... clip_len (`int`): Total number of frames to sample. + ... frame_sample_rate (`int`): Sample every n-th frame. + ... seg_len (`int`): Maximum allowed index of sample's last frame. + ... Returns: + ... indices (`List[int]`): List of sampled frame indices + ... ''' + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # video clip consists of 300 frames (10 seconds at 30 FPS) + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... 
)
+        >>> container = av.open(file_path)
+
+        >>> # sample 32 frames
+        >>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
+        >>> video = read_video_pyav(container=container, indices=indices)
+
+        >>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
+        >>> model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")
+
+        >>> # prepare video for the model
+        >>> inputs = image_processor(list(video), return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 3137, 768]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    ViViT Transformer model with a video classification head on top (a linear layer on top of the final hidden state of the
+[CLS] token) e.g. for Kinetics-400.
+
+    <Tip>
+
+    Note that it's possible to fine-tune ViViT on higher resolution images than the ones it has been trained on, by
+    setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+    position embeddings to the higher resolution.
+
+    </Tip>
+ + + """, + VIVIT_START_DOCSTRING, +) +class VivitForVideoClassification(VivitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.vivit = VivitModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIVIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> import av + >>> import numpy as np + >>> import torch + + >>> from transformers import VivitImageProcessor, VivitForVideoClassification + >>> from huggingface_hub import hf_hub_download + + >>> np.random.seed(0) + + + >>> def read_video_pyav(container, indices): + ... ''' + ... Decode the video with PyAV decoder. + ... Args: + ... container (`av.container.input.InputContainer`): PyAV container. + ... indices (`List[int]`): List of frame indices to decode. + ... Returns: + ... result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + ... ''' + ... frames = [] + ... container.seek(0) + ... start_index = indices[0] + ... end_index = indices[-1] + ... for i, frame in enumerate(container.decode(video=0)): + ... if i > end_index: + ... break + ... if i >= start_index and i in indices: + ... frames.append(frame) + ... return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + + + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... ''' + ... Sample a given number of frame indices from the video. + ... Args: + ... clip_len (`int`): Total number of frames to sample. + ... frame_sample_rate (`int`): Sample every n-th frame. + ... seg_len (`int`): Maximum allowed index of sample's last frame. + ... Returns: + ... indices (`List[int]`): List of sampled frame indices + ... ''' + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # video clip consists of 300 frames (10 seconds at 30 FPS) + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... 
)
+        >>> container = av.open(file_path)
+
+        >>> # sample 32 frames
+        >>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
+        >>> video = read_video_pyav(container=container, indices=indices)
+
+        >>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
+        >>> model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")
+
+        >>> inputs = image_processor(list(video), return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+        ...     logits = outputs.logits
+
+        >>> # model predicts one of the 400 Kinetics-400 classes
+        >>> predicted_label = logits.argmax(-1).item()
+        >>> print(model.config.id2label[predicted_label])
+        eating spaghetti
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.vivit(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.classifier(sequence_output[:, 0, :])
+
+        loss = None
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["VivitModel", "VivitPreTrainedModel", "VivitForVideoClassification"]
diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/__init__.py b/docs/transformers/build/lib/transformers/models/wav2vec2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3516b478194df713d15941c5d646fbdba987bfb0
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/wav2vec2/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
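+
+# This package initializer is lazy: under TYPE_CHECKING the star imports below expose the
+# full symbol table to static type checkers, while at runtime the module object is replaced
+# by a _LazyModule that imports each submodule only on first attribute access.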
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_wav2vec2 import *
+    from .feature_extraction_wav2vec2 import *
+    from .modeling_flax_wav2vec2 import *
+    from .modeling_tf_wav2vec2 import *
+    from .modeling_wav2vec2 import *
+    from .processing_wav2vec2 import *
+    from .tokenization_wav2vec2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/configuration_wav2vec2.py b/docs/transformers/build/lib/transformers/models/wav2vec2/configuration_wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c28aa6305f855c796a4ffaa565798daeb5d66028
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/wav2vec2/configuration_wav2vec2.py
@@ -0,0 +1,347 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wav2Vec2 model configuration"""
+
+import functools
+import operator
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Wav2Vec2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Wav2Vec2Model`]. It is used to instantiate a
+    Wav2Vec2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Wav2Vec2
+    [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32):
+            Vocabulary size of the Wav2Vec2 model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`Wav2Vec2Model`] or [`TFWav2Vec2Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for activations inside the fully connected layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
+            convolutional layers.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for quantized feature encoder states.
+        conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether the 1D convolutional layers have a bias.
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+            embeddings layer.
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+            Number of groups of 1D convolutional positional embeddings layer.
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
`do_stable_layer_norm is + True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is + False` corresponds to applying layer norm after the attention layer. + apply_spec_augment (`bool`, *optional*, defaults to `True`): + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see + [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://arxiv.org/abs/1904.08779). + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the propability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2),: + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. + mask_feature_min_masks (`int`, *optional*, defaults to 0),: + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_codevectors_per_group (`int`, *optional*, defaults to 320): + Number of entries in each quantization codebook (group). + num_codevector_groups (`int`, *optional*, defaults to 2): + Number of codevector groups for product codevector quantization. + contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): + The temperature *kappa* in the contrastive loss. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the output of the feature encoder that's used by the quantizer. + num_negatives (`int`, *optional*, defaults to 100): + Number of negative samples for the contrastive loss. + codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the quantized feature vectors. + proj_codevector_dim (`int`, *optional*, defaults to 256): + Dimensionality of the final projection of both the quantized and the transformer features. + diversity_loss_weight (`int`, *optional*, defaults to 0.1): + The weight of the codebook diversity loss component. 
+ ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`): + Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an + instance of [`Wav2Vec2ForCTC`]. + ctc_zero_infinity (`bool`, *optional*, defaults to `False`): + Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly + occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance + of [`Wav2Vec2ForCTC`]. + use_weighted_layer_sum (`bool`, *optional*, defaults to `False`): + Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an + instance of [`Wav2Vec2ForSequenceClassification`]. + classifier_proj_size (`int`, *optional*, defaults to 256): + Dimensionality of the projection before token mean-pooling for classification. + tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`): + A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN* + module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers. + tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the + *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*. + tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`): + A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the + *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. + xvector_output_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + add_adapter (`bool`, *optional*, defaults to `False`): + Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for + warm-starting Wav2Vec2 for SpeechEncoderDecoder models. + adapter_kernel_size (`int`, *optional*, defaults to 3): + Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + adapter_stride (`int`, *optional*, defaults to 2): + Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`. + num_adapter_layers (`int`, *optional*, defaults to 3): + Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is + True`. + adapter_attn_dim (`int`, *optional*): + Dimension of the attention adapter weights to be used in each attention block. An example of a model using + attention adapters is [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all). + output_hidden_size (`int`, *optional*): + Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant + if `add_adapter is True`. 
+ + Example: + + ```python + >>> from transformers import Wav2Vec2Config, Wav2Vec2Model + + >>> # Initializing a Wav2Vec2 facebook/wav2vec2-base-960h style configuration + >>> configuration = Wav2Vec2Config() + + >>> # Initializing a model (with random weights) from the facebook/wav2vec2-base-960h style configuration + >>> model = Wav2Vec2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "wav2vec2" + + def __init__( + self, + vocab_size=32, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.0, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="group", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + do_stable_layer_norm=False, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + classifier_proj_size=256, + tdnn_dim=(512, 512, 512, 512, 1500), + tdnn_kernel=(5, 3, 3, 1, 1), + tdnn_dilation=(1, 2, 3, 1, 1), + xvector_output_dim=512, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + adapter_attn_dim=None, + **kwargs, + ): + super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id) + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.use_weighted_layer_sum = use_weighted_layer_sum + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. 
It is required that `len(config.conv_dim)` ==" + " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" + f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," + f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + self.adapter_attn_dim = adapter_attn_dim + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) + + +__all__ = ["Wav2Vec2Config"] diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..5613f83a86b4e79450a4068e0152df649565bfd2 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,385 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
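Before moving on to the conversion script, it is worth noting that the `inputs_to_logits_ratio` property defined at the end of the configuration above is simply the product of `conv_stride`, which ties the raw-waveform length to the number of encoder frames. A minimal sanity-check sketch, assuming only the default `Wav2Vec2Config` shown above and the 16 kHz sampling rate used throughout these files:

```python
from transformers import Wav2Vec2Config

config = Wav2Vec2Config()

# Product of the default conv strides (5, 2, 2, 2, 2, 2, 2) -> 320,
# i.e. roughly one encoder frame per 320 input samples.
assert config.inputs_to_logits_ratio == 320

# One second of 16 kHz audio therefore yields about 16000 / 320 = 50
# encoder frames, ignoring edge effects from the convolution kernels.
num_samples = 16_000
print(num_samples // config.inputs_to_logits_ratio)  # 50
```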
+"""Convert Wav2Vec2 checkpoint.""" + +import argparse +import json +import os + +import fairseq +import torch +from fairseq.data import Dictionary + +from transformers import ( + Wav2Vec2Config, + Wav2Vec2CTCTokenizer, + Wav2Vec2FeatureExtractor, + Wav2Vec2ForCTC, + Wav2Vec2ForPreTraining, + Wav2Vec2Processor, + logging, +) +from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2ForSequenceClassification + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +MAPPING = { + "post_extract_proj": "feature_projection.projection", + "encoder.pos_conv.0": "encoder.pos_conv_embed.conv", + "self_attn.k_proj": "encoder.layers.*.attention.k_proj", + "self_attn.v_proj": "encoder.layers.*.attention.v_proj", + "self_attn.q_proj": "encoder.layers.*.attention.q_proj", + "self_attn.out_proj": "encoder.layers.*.attention.out_proj", + "self_attn_layer_norm": "encoder.layers.*.layer_norm", + "fc1": "encoder.layers.*.feed_forward.intermediate_dense", + "fc2": "encoder.layers.*.feed_forward.output_dense", + "final_layer_norm": "encoder.layers.*.final_layer_norm", + "encoder.layer_norm": "encoder.layer_norm", + "adapter_layer": "encoder.layers.*.adapter_layer", + "w2v_model.layer_norm": "feature_projection.layer_norm", + "quantizer.weight_proj": "quantizer.weight_proj", + "quantizer.vars": "quantizer.codevectors", + "project_q": "project_q", + "final_proj": "project_hid", + "w2v_encoder.proj": "lm_head", + "mask_emb": "masked_spec_embed", + "pooling_layer.linear": "projector", + "pooling_layer.projection": "classifier", +} +TOP_LEVEL_KEYS = [ + "lm_head", + "quantizer.weight_proj", + "quantizer.codevectors", + "project_q", + "project_hid", + "projector", + "classifier", +] + + +def read_txt_into_dict(filename): + result = {} + with open(filename, "r") as file: + for line_number, line in enumerate(file): + line = line.strip() + if line: + words = line.split() + key = line_number + value = words[0] + result[key] = value + return result + + +def set_recursively(key, value, full_name, weight_type, hf_pointer): + for attribute in key.split("."): + hf_pointer = getattr(hf_pointer, attribute) + + hf_param_name = None + for param_key in PARAM_MAPPING.keys(): + if full_name.endswith(param_key): + hf_param_name = PARAM_MAPPING[full_name.split(".")[-1]] + weight_type = "param" + + # fairseq uses nn.utils.weight_norm() while transformers switches to nn.utils.parametrizations.weight_norm() + # the mapping between two versions: + # https://github.com/pytorch/pytorch/blob/56935684c3dfad7841c83c719eeebecb560fe466/torch/nn/utils/parametrizations.py#L389-L395 + + if weight_type is not None and weight_type != "param": + if weight_type == "weight_g" and not hasattr(hf_pointer, "weight_g"): + hf_shape = hf_pointer.parametrizations.weight.original0.shape + elif weight_type == "weight_v" and not hasattr(hf_pointer, "weight_v"): + hf_shape = hf_pointer.parametrizations.weight.original1.shape + else: + hf_shape = getattr(hf_pointer, weight_type).shape + elif weight_type is not None and weight_type == "param": + shape_pointer = hf_pointer + for attribute in hf_param_name.split("."): + shape_pointer = getattr(shape_pointer, attribute) + hf_shape = shape_pointer.shape + + # let's reduce dimension + value = value[0] + else: + hf_shape = hf_pointer.shape + + if hf_shape != value.shape: + raise ValueError( + f"Shape of hf {key + '.' 
+ weight_type if weight_type is not None else ''} is {hf_shape}, but should be" + f" {value.shape} for {full_name}" + ) + + if weight_type == "weight": + hf_pointer.weight.data = value + elif weight_type == "weight_g": + if hasattr(hf_pointer, "weight_g"): + hf_pointer.weight_g.data = value + else: + hf_pointer.parametrizations.weight.original0.data = value + elif weight_type == "weight_v": + if hasattr(hf_pointer, "weight_v"): + hf_pointer.weight_v.data = value + else: + hf_pointer.parametrizations.weight.original1.data = value + elif weight_type == "bias": + hf_pointer.bias.data = value + elif weight_type == "param": + for attribute in hf_param_name.split("."): + hf_pointer = getattr(hf_pointer, attribute) + hf_pointer.data = value + else: + hf_pointer.data = value + + logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.") + + +def rename_dict(key, value, full_name, weight_type, hf_dict): + hf_param_name = None + for param_key in PARAM_MAPPING.keys(): + if full_name.endswith(param_key): + hf_param_name = PARAM_MAPPING[full_name.split(".")[-1]] + weight_type = "param" + + if weight_type is not None and weight_type != "param": + full_key = ".".join([key, weight_type]) + elif weight_type is not None and weight_type == "param": + full_key = ".".join([key, hf_param_name]) + else: + full_key = key + + hf_dict[full_key] = value if "lm_head" in full_key else value[0] + + +PARAM_MAPPING = { + "W_a": "linear_1.weight", + "W_b": "linear_2.weight", + "b_a": "linear_1.bias", + "b_b": "linear_2.bias", + "ln_W": "norm.weight", + "ln_b": "norm.bias", +} + + +def load_wav2vec2_layer(name, value, hf_model=None, hf_dict=None): + is_used = False + for key, mapped_key in MAPPING.items(): + mapped_key = "wav2vec2." 
+ mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key + if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]: + is_used = True + if "*" in mapped_key: + layer_index = name.split(key)[0].split(".")[-2] + mapped_key = mapped_key.replace("*", layer_index) + if "weight_g" in name: + weight_type = "weight_g" + elif "weight_v" in name: + weight_type = "weight_v" + elif "bias" in name: + weight_type = "bias" + elif "weight" in name: + # TODO: don't match quantizer.weight_proj + weight_type = "weight" + else: + weight_type = None + if hf_dict is not None: + rename_dict(mapped_key, value, name, weight_type, hf_dict) + else: + set_recursively(mapped_key, value, name, weight_type, hf_model) + return is_used + return is_used + + +def recursively_load_weights(fairseq_model, hf_model, is_headless): + unused_weights = [] + fairseq_dict = fairseq_model.state_dict() + + feature_extractor = hf_model.wav2vec2.feature_extractor + + for name, value in fairseq_dict.items(): + is_used = False + if "conv_layers" in name: + load_conv_layer( + name, + value, + feature_extractor, + unused_weights, + hf_model.config.feat_extract_norm == "group", + ) + is_used = True + else: + is_used = load_wav2vec2_layer(name, value, hf_model) + if not is_used: + unused_weights.append(name) + + logger.warning(f"Unused weights: {unused_weights}") + + +def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm): + name = full_name.split("conv_layers.")[-1] + items = name.split(".") + layer_id = int(items[0]) + type_id = int(items[1]) + + if type_id == 0: + if "bias" in name: + if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but" + f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].conv.bias.data = value + logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") + elif "weight" in name: + if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but" + f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].conv.weight.data = value + logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.") + elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm): + if "bias" in name: + if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but" + f" {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found." + ) + feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value + logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.") + elif "weight" in name: + if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape: + raise ValueError( + f"{full_name} has size {value.shape}, but" + f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found." 
+                )
+            feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
+            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
+    else:
+        unused_weights.append(full_name)
+
+
+@torch.no_grad()
+def convert_wav2vec2_checkpoint(
+    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True, is_seq_class=False
+):
+    """
+    Copy/paste/tweak model's weights to transformers design.
+    """
+    if config_path is not None:
+        config = Wav2Vec2Config.from_pretrained(config_path)
+    else:
+        config = Wav2Vec2Config()
+
+    if is_seq_class:
+        id2label = read_txt_into_dict(dict_path)
+        config.id2label = id2label
+        hf_wav2vec = Wav2Vec2ForSequenceClassification(config)
+        feature_extractor = Wav2Vec2FeatureExtractor(
+            feature_size=1,
+            sampling_rate=16000,
+            padding_value=0,
+            do_normalize=True,
+            return_attention_mask=True,
+        )
+        feature_extractor.save_pretrained(pytorch_dump_folder_path)
+
+    elif is_finetuned:
+        if dict_path:
+            target_dict = Dictionary.load(dict_path)
+
+            # important change bos & pad token id since CTC symbol is <pad> and
+            # not <s> as in fairseq
+            config.bos_token_id = target_dict.pad_index
+            config.pad_token_id = target_dict.bos_index
+            config.eos_token_id = target_dict.eos_index
+            config.vocab_size = len(target_dict.symbols)
+            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
+            if not os.path.isdir(pytorch_dump_folder_path):
+                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
+                return
+            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
+            vocab_dict = target_dict.indices
+
+            # fairseq has the <pad> and <s> switched
+            vocab_dict["<pad>"] = 0
+            vocab_dict["<s>"] = 1
+            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
+                json.dump(vocab_dict, vocab_handle)
+            tokenizer = Wav2Vec2CTCTokenizer(
+                vocab_path,
+                unk_token=target_dict.unk_word,
+                pad_token=target_dict.pad_word,
+                bos_token=target_dict.bos_word,
+                eos_token=target_dict.eos_word,
+                word_delimiter_token="|",
+                do_lower_case=False,
+            )
+            return_attention_mask = True if config.feat_extract_norm == "layer" else False
+            feature_extractor = Wav2Vec2FeatureExtractor(
+                feature_size=1,
+                sampling_rate=16000,
+                padding_value=0,
+                do_normalize=True,
+                return_attention_mask=return_attention_mask,
+            )
+            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+            processor.save_pretrained(pytorch_dump_folder_path)
+
+        hf_wav2vec = Wav2Vec2ForCTC(config)
+    else:
+        hf_wav2vec = Wav2Vec2ForPreTraining(config)
+
+    if is_finetuned or is_seq_class:
+        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
+            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
+        )
+    else:
+        task_arg = argparse.Namespace(task="audio_pretraining")
+        task = fairseq.tasks.setup_task(task_arg)
+
+        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path], task=task)
+
+    model = model[0].eval()
+
+    recursively_load_weights(model, hf_wav2vec, not is_finetuned)
+
+    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
+    parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
+    parser.add_argument("--config_path",
default=None, type=str, help="Path to hf config.json of model to convert") + parser.add_argument( + "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not" + ) + parser.add_argument( + "--is_seq_class", + action="store_true", + help="Whether the model to convert is a fine-tuned sequence classification model or not", + ) + args = parser.parse_args() + + is_finetuned = not args.not_finetuned and not args.is_seq_class + convert_wav2vec2_checkpoint( + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.config_path, + args.dict_path, + is_finetuned, + args.is_seq_class, + ) diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..fa33416c8bdc6d0a82c50c364438861626d26879 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2/convert_wav2vec2_original_s3prl_checkpoint_to_pytorch.py @@ -0,0 +1,109 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Hubert checkpoint.""" + +import argparse + +import torch + +from transformers import ( + Wav2Vec2Config, + Wav2Vec2FeatureExtractor, + Wav2Vec2ForAudioFrameClassification, + Wav2Vec2ForSequenceClassification, + Wav2Vec2ForXVector, + logging, +) + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def convert_classification(base_model_name, hf_config, downstream_dict): + model = Wav2Vec2ForSequenceClassification.from_pretrained(base_model_name, config=hf_config) + model.projector.weight.data = downstream_dict["projector.weight"] + model.projector.bias.data = downstream_dict["projector.bias"] + model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"] + model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"] + return model + + +def convert_diarization(base_model_name, hf_config, downstream_dict): + model = Wav2Vec2ForAudioFrameClassification.from_pretrained(base_model_name, config=hf_config) + model.classifier.weight.data = downstream_dict["model.linear.weight"] + model.classifier.bias.data = downstream_dict["model.linear.bias"] + return model + + +def convert_xvector(base_model_name, hf_config, downstream_dict): + model = Wav2Vec2ForXVector.from_pretrained(base_model_name, config=hf_config) + model.projector.weight.data = downstream_dict["connector.weight"] + model.projector.bias.data = downstream_dict["connector.bias"] + for i, kernel_size in enumerate(hf_config.tdnn_kernel): + model.tdnn[i].kernel.weight.data = downstream_dict[ + f"model.framelevel_feature_extractor.module.{i}.kernel.weight" + ] + model.tdnn[i].kernel.bias.data = downstream_dict[f"model.framelevel_feature_extractor.module.{i}.kernel.bias"] + + model.feature_extractor.weight.data = 
downstream_dict["model.utterancelevel_feature_extractor.linear1.weight"] + model.feature_extractor.bias.data = downstream_dict["model.utterancelevel_feature_extractor.linear1.bias"] + model.classifier.weight.data = downstream_dict["model.utterancelevel_feature_extractor.linear2.weight"] + model.classifier.bias.data = downstream_dict["model.utterancelevel_feature_extractor.linear2.bias"] + model.objective.weight.data = downstream_dict["objective.W"] + return model + + +@torch.no_grad() +def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path): + """ + Copy/paste/tweak model's weights to transformers design. + """ + checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True) + + downstream_dict = checkpoint["Downstream"] + + hf_config = Wav2Vec2Config.from_pretrained(config_path) + hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + base_model_name, return_attention_mask=True, do_normalize=False + ) + + arch = hf_config.architectures[0] + if arch.endswith("ForSequenceClassification"): + hf_model = convert_classification(base_model_name, hf_config, downstream_dict) + elif arch.endswith("ForAudioFrameClassification"): + hf_model = convert_diarization(base_model_name, hf_config, downstream_dict) + elif arch.endswith("ForXVector"): + hf_model = convert_xvector(base_model_name, hf_config, downstream_dict) + else: + raise NotImplementedError(f"S3PRL weights conversion is not supported for {arch}") + + if hf_config.use_weighted_layer_sum: + hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"] + + hf_feature_extractor.save_pretrained(model_dump_path) + hf_model.save_pretrained(model_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model." + ) + parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.") + parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.") + parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.") + args = parser.parse_args() + convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path) diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/feature_extraction_wav2vec2.py b/docs/transformers/build/lib/transformers/models/wav2vec2/feature_extraction_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..3dde386b32af9b8b46308d0dc7c9e879fbff054a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2/feature_extraction_wav2vec2.py @@ -0,0 +1,243 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
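The file that follows defines the feature extractor, the entry point that turns raw waveforms into padded, normalized `input_values`. A minimal usage sketch, following the `__call__` signature defined below; the zero-filled arrays are placeholder audio, not real data:

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Two mono clips of different lengths; `padding=True` pads to the longest.
raw_speech = [
    np.zeros(16000, dtype=np.float32),  # 1.0 s placeholder clip
    np.zeros(8000, dtype=np.float32),   # 0.5 s placeholder clip
]
inputs = feature_extractor(raw_speech, sampling_rate=16000, padding=True, return_tensors="np")

print(inputs["input_values"].shape)    # (2, 16000)
print(inputs["attention_mask"].shape)  # (2, 16000)
```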
+""" +Feature extractor class for Wav2Vec2 +""" + +from typing import List, Optional, Union + +import numpy as np + +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import PaddingStrategy, TensorType, logging + + +logger = logging.get_logger(__name__) + + +class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a Wav2Vec2 feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + Args: + feature_size (`int`, *optional*, defaults to 1): + The feature dimension of the extracted features. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + padding_value (`float`, *optional*, defaults to 0.0): + The value that is used to fill the padding values. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly + improve the performance for some models, *e.g.*, + [wav2vec2-lv60](https://huggingface.co/models?search=lv60). + return_attention_mask (`bool`, *optional*, defaults to `False`): + Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`. + + + + Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as + [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using + `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask` + should be passed. + + For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as + [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be + passed for batched inference. 
+ + """ + + model_input_names = ["input_values", "attention_mask"] + + def __init__( + self, + feature_size=1, + sampling_rate=16000, + padding_value=0.0, + return_attention_mask=False, + do_normalize=True, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.return_attention_mask = return_attention_mask + self.do_normalize = do_normalize + + @staticmethod + def zero_mean_unit_var_norm( + input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0 + ) -> List[np.ndarray]: + """ + Every array in the list is normalized to have zero mean and unit variance + """ + if attention_mask is not None: + attention_mask = np.array(attention_mask, np.int32) + normed_input_values = [] + + for vector, length in zip(input_values, attention_mask.sum(-1)): + normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7) + if length < normed_slice.shape[0]: + normed_slice[length:] = padding_value + + normed_input_values.append(normed_slice) + else: + normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values] + + return normed_input_values + + def __call__( + self, + raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], + padding: Union[bool, str, PaddingStrategy] = False, + max_length: Optional[int] = None, + truncation: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + sampling_rate: Optional[int] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific feature_extractor's default. 
+ + [What are attention masks?](../glossary#attention-mask) + + + + Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as + [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using + `attention_mask`. For such models, `input_values` should simply be padded with 0 and no + `attention_mask` should be passed. + + For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as + [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should + be passed for batched inference. + + + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + padding_value (`float`, *optional*, defaults to 0.0): + """ + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + + # always return batch + if not is_batched: + raw_speech = [raw_speech] + + # convert into correct format for padding + encoded_inputs = BatchFeature({"input_values": raw_speech}) + + padded_inputs = self.pad( + encoded_inputs, + padding=padding, + max_length=max_length, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + # convert input values to correct format + input_values = padded_inputs["input_values"] + if not isinstance(input_values[0], np.ndarray): + padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values] + elif ( + not isinstance(input_values, np.ndarray) + and isinstance(input_values[0], np.ndarray) + and input_values[0].dtype is np.dtype(np.float64) + ): + padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values] + elif isinstance(input_values, np.ndarray) and input_values.dtype is np.dtype(np.float64): + padded_inputs["input_values"] = input_values.astype(np.float32) + + # convert attention_mask to correct format + attention_mask = padded_inputs.get("attention_mask") + if attention_mask is not None: + padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask] + + # zero-mean and unit-variance normalization + if self.do_normalize: + attention_mask = ( + attention_mask + if self._get_padding_strategies(padding, max_length=max_length) is not 
PaddingStrategy.DO_NOT_PAD + else None + ) + padded_inputs["input_values"] = self.zero_mean_unit_var_norm( + padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value + ) + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + +__all__ = ["Wav2Vec2FeatureExtractor"] diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_flax_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..547076205018b7a15e2d7785eeea8c9cc9eed109 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -0,0 +1,1428 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax Wav2Vec2 model.""" + +from functools import partial +from typing import Optional, Tuple, Union + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_wav2vec2 import Wav2Vec2Config + + +logger = logging.get_logger(__name__) + + +@flax.struct.dataclass +class FlaxWav2Vec2BaseModelOutput(ModelOutput): + """ + Output type of [`FlaxWav2Vec2BaseModelOutput`], with potential hidden states and attentions. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + extract_features (`jnp.ndarray` of shape `(batch_size, sequence_length, last_conv_dim)`): + Sequence of extracted feature vectors of the last convolutional layer of the model with `last_conv_dim` + being the dimension of the last convolutional layer. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: jnp.ndarray = None + extract_features: jnp.ndarray = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxWav2Vec2ForPreTrainingOutput(ModelOutput): + """ + Output type of [`FlaxWav2Vec2ForPreTrainingOutput`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when model is in train mode, `jnp.ndarray` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`jnp.ndarray` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`jnp.ndarray` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + projected_states: jnp.ndarray = None + projected_quantized_states: jnp.ndarray = None + codevector_perplexity: jnp.ndarray = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[np.ndarray] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: the shape for which to compute masks. + should be of size 2 where first element is batch size and 2nd is timesteps + mask_prob: + probability for each token to be chosen as start of the span to be masked. this will be multiplied by + number of timesteps divided by length of mask span to mask approximately this percentage of all elements. 
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True) + mask_length: size of the mask + min_masks: minimum number of masked spans + + """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and" + f" `sequence_length`: {sequence_length}`" + ) + + # compute number of masked spans in batch + num_masked_spans = int(mask_prob * sequence_length / mask_length + np.random.rand(1).item()) + num_masked_spans = max(num_masked_spans, min_masks) + + # make sure num masked indices <= sequence_length + if num_masked_spans * mask_length > sequence_length: + num_masked_spans = sequence_length // mask_length + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + + # get random indices to mask + spec_aug_mask_idxs = np.array( + [ + np.random.choice(np.arange(sequence_length - (mask_length - 1)), num_masked_spans, replace=False) + for _ in range(batch_size) + ] + ) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to(spec_aug_mask_idxs[:, :, None], (batch_size, num_masked_spans, mask_length)) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, num_masked_spans * mask_length) + + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, num_masked_spans, mask_length)).reshape( + batch_size, num_masked_spans * mask_length + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + if attention_mask is not None: + # make sure padded input ids cannot be masked + spec_aug_mask = np.where(attention_mask, spec_aug_mask, False) + + return spec_aug_mask + + +def _sample_negative_indices(features_shape: Tuple, num_negatives: int, attention_mask: Optional[np.ndarray] = None): + """ + Sample `num_negatives` vectors from feature vectors. + """ + batch_size, sequence_length, hidden_size = features_shape + if sequence_length <= 1: + raise ValueError( + "`features should have `sequence_length` > 1, but are of shape " + f"(batch_size, sequence_length, hidden_size) = ({batch_size, sequence_length, hidden_size})." 
+ ) + + # get `num_negatives` random vector indices from the same utterance + sampled_negative_indices = [] + for batch_idx in range(batch_size): + high = attention_mask[batch_idx].sum() - 1 if attention_mask is not None else sequence_length - 1 + sampled_indices_slice = np.random.randint(0, high, size=(num_negatives * sequence_length,)) + sampled_negative_indices.append(sampled_indices_slice) + + sampled_negative_indices = np.asarray(sampled_negative_indices, dtype=np.int32) + + # generate indices of the positive vectors themselves, repeat them `num_negatives` times + feature_indices = np.broadcast_to(np.arange(sequence_length)[:, None], (sequence_length, num_negatives)).flatten() + + # avoid sampling the same positive vector, but keep the distribution uniform + sampled_negative_indices[sampled_negative_indices >= feature_indices] += 1 + + # correct for batch size + for batch_idx in range(1, batch_size): + sampled_negative_indices[batch_idx] += batch_idx * sequence_length + + return sampled_negative_indices + + +WAV2VEC2_START_DOCSTRING = r""" + Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech + Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael + Auli. + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + + +WAV2VEC2_INPUTS_DOCSTRING = r""" + Args: + input_values (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file + into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install + soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and + conversion into a tensor of type `jnp.ndarray`. See [`Wav2Vec2Processor.__call__`] for details. + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0, + 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) .. warning:: `attention_mask` should only be passed + if the corresponding processor has `config.return_attention_mask == True`. For all models whose processor + has `config.return_attention_mask == False`, such as + [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be + passed to avoid degraded performance when doing batched inference. For such models `input_values` should + simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly + different results depending on whether `input_values` is padded or not. + mask_time_indices (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict + masked extracted features in *config.proj_codevector_dim* space. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +class FlaxWav2Vec2LayerNormConvLayer(nn.Module): + config: Wav2Vec2Config + layer_id: int = 0 + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.in_conv_dim = self.config.conv_dim[self.layer_id] if self.layer_id > 0 else 1 + self.out_conv_dim = self.config.conv_dim[self.layer_id] + + self.conv = nn.Conv( + features=self.config.conv_dim[self.layer_id], + kernel_size=(self.config.conv_kernel[self.layer_id],), + strides=(self.config.conv_stride[self.layer_id],), + use_bias=self.config.conv_bias, + kernel_init=jax.nn.initializers.he_normal(), + padding="VALID", + dtype=self.dtype, + ) + self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.activation = ACT2FN[self.config.feat_extract_activation] + + def __call__(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class FlaxConvWithWeightNorm(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv = nn.Conv( + features=self.config.hidden_size, + kernel_size=(self.config.num_conv_pos_embeddings,), + kernel_init=jax.nn.initializers.he_normal(), + padding="VALID", + feature_group_count=self.config.num_conv_pos_embedding_groups, + dtype=self.dtype, + ) + weight_shape = ( + self.conv.features, + self.conv.features // self.conv.feature_group_count, + self.conv.kernel_size[0], + ) + self.weight_v = self.param("weight_v", jax.nn.initializers.he_normal(), weight_shape) + self.weight_g = self.param("weight_g", lambda _: jnp.linalg.norm(self.weight_v, axis=(0, 1))[None, None, :]) + self.bias = self.param("bias", jax.nn.initializers.zeros, (self.conv.features,)) + self.prev_padding = self.conv.kernel_size[0] // 2 + + def _get_normed_weights(self): + weight_v_norm = jnp.linalg.norm(self.weight_v, axis=(0, 1))[None, None, :] + normed_weight_v = jnp.divide(self.weight_v, weight_v_norm) + normed_kernel = jnp.multiply(normed_weight_v, self.weight_g) + return normed_kernel + + def __call__(self, hidden_states): + kernel = self._get_normed_weights() + hidden_states = jnp.pad(hidden_states, ((0, 0), (self.prev_padding, self.prev_padding), (0, 0))) + hidden_states = self.conv.apply({"params": {"kernel": kernel.T, "bias": self.bias}}, hidden_states) + return hidden_states + + +class FlaxWav2Vec2PositionalConvEmbedding(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv = FlaxConvWithWeightNorm(self.config, dtype=self.dtype) + self.activation = ACT2FN[self.config.feat_extract_activation] + self.num_pad_remove = 1 if self.config.num_conv_pos_embeddings % 2 == 0 else 0 + + def __call__(self, hidden_states): + hidden_states = hidden_states.transpose((0, 1, 2)) + + hidden_states = self.conv(hidden_states) + + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, : -self.num_pad_remove, :] + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose((0, 1, 2)) + return hidden_states + + +class FlaxConvLayersCollection(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + if self.config.feat_extract_norm == "layer": + self.layers = [ + FlaxWav2Vec2LayerNormConvLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype) + for i in range(self.config.num_feat_extract_layers) + ] + elif self.config.feat_extract_norm == "group": + raise NotImplementedError("At the moment only ``config.feat_extact_norm == 
'layer'`` is supported") + else: + raise ValueError( + f"`config.feat_extract_norm` is {self.config.feat_extract_norm}, but has to be one of ['group'," + " 'layer']" + ) + + def __call__(self, hidden_states): + for i, conv_layer in enumerate(self.layers): + hidden_states = conv_layer(hidden_states) + return hidden_states + + +class FlaxWav2Vec2FeatureEncoder(nn.Module): + """Construct the features from raw audio waveform""" + + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv_layers = FlaxConvLayersCollection(self.config, dtype=self.dtype) + + def __call__(self, input_values, freeze_feature_encoder=False): + hidden_states = input_values[:, :, None] + hidden_states = self.conv_layers(hidden_states) + if freeze_feature_encoder: + hidden_states = jax.lax.stop_gradient(hidden_states) + return hidden_states + + +class FlaxWav2Vec2FeatureProjection(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.projection = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.feat_proj_dropout) + + def __call__(self, hidden_states, deterministic=True): + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states, norm_hidden_states + + +class FlaxWav2Vec2Attention(nn.Module): + config: Wav2Vec2Config + embed_dim: int + num_heads: int + dropout: float = 0.0 + bias: bool = True + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self) -> None: + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + + dense = partial( + nn.Dense, + self.embed_dim, + use_bias=self.bias, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense() + self.out_proj = dense() + + self.dropout_layer = nn.Dropout(rate=self.dropout) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + def __call__( + self, + hidden_states: jnp.ndarray, + key_value_states: Optional[jnp.ndarray] = None, + attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + """Input shape: Batch x Time x Channel""" + + # get query proj + query_states = self.q_proj(hidden_states) + + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + if attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # Convert the boolean attention mask to an attention bias. 
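+        # (0.0 at positions that may be attended to, and a large negative value at masked positions, so the
+        # bias can simply be added to the raw attention scores before the softmax)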
+ if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.dropout, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class FlaxWav2Vec2FeedForward(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.intermediate_dropout = nn.Dropout(rate=self.config.activation_dropout) + + self.intermediate_dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + if isinstance(self.config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[self.config.hidden_act] + else: + self.intermediate_act_fn = self.config.hidden_act + + self.output_dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.output_dropout = nn.Dropout(rate=self.config.hidden_dropout) + + def __call__(self, hidden_states, deterministic=True): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states, deterministic=deterministic) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxWav2Vec2EncoderLayerStableLayerNorm(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.attention = FlaxWav2Vec2Attention( + config=self.config, + embed_dim=self.config.hidden_size, + num_heads=self.config.num_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout) + self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.feed_forward = FlaxWav2Vec2FeedForward(self.config, dtype=self.dtype) + self.final_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, attention_mask=None, deterministic=True, output_attentions=False): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights = self.attention( + hidden_states, attention_mask=attention_mask, deterministic=deterministic + ) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward( + self.final_layer_norm(hidden_states), deterministic=deterministic + ) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class FlaxWav2Vec2EncoderLayerStableLayerNormCollection(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def 
setup(self): + self.layers = [ + FlaxWav2Vec2EncoderLayerStableLayerNorm(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxWav2Vec2StableLayerNormEncoder(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.pos_conv_embed = FlaxWav2Vec2PositionalConvEmbedding(self.config, dtype=self.dtype) + self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout) + self.layers = FlaxWav2Vec2EncoderLayerStableLayerNormCollection(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask=None, + deterministic=True, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + if attention_mask is not None: + # make sure padded tokens are not attended to + hidden_states = jnp.where( + jnp.broadcast_to(attention_mask[:, :, None], hidden_states.shape), hidden_states, 0 + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = self.layer_norm(outputs[0]) + + # update the last element in `hidden_states` after applying `layernorm` above + hidden_states = None + if output_hidden_states: + hidden_states = outputs[1] + hidden_states = hidden_states[:-1] + (last_hidden_state,) + + if not return_dict: + outputs = (last_hidden_state, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:]) + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=outputs.attentions + ) + + +class FlaxWav2Vec2GumbelVectorQuantizer(nn.Module): + """ + Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. 
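+
+    As a rough, illustrative sketch (not the module itself) of the differentiable sampling step used during
+    training, assuming standalone JAX/Flax imports:
+
+    ```python
+    >>> import jax
+    >>> import jax.numpy as jnp
+    >>> from flax import linen as nn
+
+    >>> logits = jnp.array([[2.0, 0.5, 1.0]])  # unnormalized scores over the codevectors of one group
+    >>> gumbels = jax.random.gumbel(jax.random.PRNGKey(0), logits.shape)
+    >>> temperature = 2.0
+    >>> soft_probs = nn.softmax((logits + gumbels) / temperature)  # a differentiable "sample" over codevectors
+    ```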
+ """ + + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.num_groups = self.config.num_codevector_groups + self.num_vars = self.config.num_codevectors_per_group + + if self.config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {self.config.codevector_dim} must be divisible by" + f" `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = self.param( + "codevectors", + jax.nn.initializers.uniform(), + (1, self.num_groups * self.num_vars, self.config.codevector_dim // self.num_groups), + ) + self.weight_proj = nn.Dense( + self.num_groups * self.num_vars, + kernel_init=jax.nn.initializers.normal(1.0), + dtype=self.dtype, + ) + + @staticmethod + def _compute_perplexity(probs, mask=None): + if mask is not None: + mask_extended = jnp.broadcast_to(mask.flatten()[:, None, None], probs.shape) + probs = jnp.where(mask_extended, probs, jnp.zeros_like(probs)) + marginal_probs = probs.sum(axis=0) / mask.sum() + else: + marginal_probs = probs.mean(axis=0) + + perplexity = jnp.exp(-jnp.sum(marginal_probs * jnp.log(marginal_probs + 1e-7), axis=-1)).sum() + return perplexity + + def __call__(self, hidden_states, mask_time_indices=None, deterministic=True, temperature=1): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.reshape(batch_size * sequence_length * self.num_groups, -1) + + if not deterministic: + # sample code vector probs via gumbel in differentiateable way + gumbel_rng = self.make_rng("gumbel") + gumbels = jax.random.gumbel(gumbel_rng, hidden_states.shape) + codevector_probs = nn.softmax((hidden_states + gumbels) / temperature) + + # compute perplexity + codevector_soft_dist = nn.softmax( + hidden_states.reshape(batch_size * sequence_length, self.num_groups, -1), axis=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(axis=-1) + codevector_probs = jax.nn.one_hot(codevector_idx, hidden_states.shape[-1]) * 1.0 + codevector_probs = codevector_probs.reshape(batch_size * sequence_length, self.num_groups, -1) + perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) + + codevector_probs = codevector_probs.reshape(batch_size * sequence_length, -1) + # use probs to retrieve codevectors + codevectors_per_group = jnp.expand_dims(codevector_probs, axis=-1) * self.codevectors + codevectors = codevectors_per_group.reshape(batch_size * sequence_length, self.num_groups, self.num_vars, -1) + codevectors = codevectors.sum(-2).reshape(batch_size, sequence_length, -1) + + return codevectors, perplexity + + +class FlaxWav2Vec2Adapter(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + # hidden_states require down-projection if feature dims don't match + if self.config.output_hidden_size != self.config.hidden_size: + self.proj = nn.Dense( + self.config.output_hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.proj_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + else: + self.proj = self.proj_layer_norm = None + + self.layers = FlaxWav2Vec2AdapterLayersCollection(self.config, dtype=self.dtype) + + def 
__call__(self, hidden_states, deterministic=True): + # down-project hidden_states if required + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = self.layers(hidden_states) + + return hidden_states + + +class FlaxWav2Vec2AdapterLayer(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv = nn.Conv( + features=2 * self.config.output_hidden_size, + kernel_size=(self.config.adapter_kernel_size,), + strides=(self.config.adapter_stride,), + padding=((1, 1),), + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.glu(hidden_states, axis=2) + + return hidden_states + + +class FlaxWav2Vec2AdapterLayersCollection(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.layers = [ + FlaxWav2Vec2AdapterLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_adapter_layers) + ] + + def __call__(self, hidden_states): + for conv_layer in self.layers: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class FlaxWav2Vec2PreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Wav2Vec2Config + base_model_prefix: str = "wav2vec2" + main_input_name = "input_values" + module_class: nn.Module = None + + def __init__( + self, + config: Wav2Vec2Config, + input_shape: Tuple = (1, 1024), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_values = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_values) + params_rng, dropout_rng = jax.random.split(rng, 2) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_values, attention_mask, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING) + def __call__( + self, + input_values, + attention_mask=None, + mask_time_indices=None, + params: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + freeze_feature_encoder: bool = False, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + batch_size, sequence_length = input_values.shape 
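+        # (when `attention_mask` is omitted, a mask of ones is built below, i.e. no frames are treated as padding)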
+ + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + return self.module.apply( + inputs, + jnp.array(input_values, dtype="f4"), + jnp.array(attention_mask, dtype="i4"), + mask_time_indices, + not train, + output_attentions, + output_hidden_states, + freeze_feature_encoder, + return_dict, + rngs=rngs, + ) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None + ): + return self.module._get_feat_extract_output_lengths(input_lengths, add_adapter=add_adapter) + + +class FlaxWav2Vec2Module(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.feature_extractor = FlaxWav2Vec2FeatureEncoder(self.config, dtype=self.dtype) + self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype) + self.masked_spec_embed = self.param( + "masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,) + ) + + if self.config.do_stable_layer_norm: + self.encoder = FlaxWav2Vec2StableLayerNormEncoder(self.config, dtype=self.dtype) + else: + raise NotImplementedError("``config.do_stable_layer_norm is False`` is currently not supported.") + + self.adapter = FlaxWav2Vec2Adapter(self.config, dtype=self.dtype) if self.config.add_adapter else None + + def __call__( + self, + input_values, + attention_mask=None, + mask_time_indices=None, + deterministic=True, + output_attentions=None, + output_hidden_states=None, + freeze_feature_encoder=False, + return_dict=None, + ): + extract_features = self.feature_extractor(input_values, freeze_feature_encoder=freeze_feature_encoder) + + # make sure that no loss is computed on padded inputs + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features, deterministic=deterministic) + if mask_time_indices is not None: # apply SpecAugment along time axis with given indices + hidden_states = jnp.where( + jnp.broadcast_to(mask_time_indices[:, :, None], hidden_states.shape), + jnp.broadcast_to(self.masked_spec_embed[None, None, :], hidden_states.shape), + hidden_states, + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return FlaxWav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from 
https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        if add_adapter:
+            for _ in range(self.config.num_adapter_layers):
+                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: jnp.ndarray, add_adapter=None
+    ):
+        # Effectively attention_mask.sum(-1), but not in-place so that it can
+        # run in inference mode.
+        non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1]
+
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = jnp.zeros((batch_size, feature_vector_length), dtype=attention_mask.dtype)
+        # these two operations make sure that all values
+        # before the output lengths indices are attended to
+        attention_mask = attention_mask.at[jnp.arange(attention_mask.shape[0]), output_lengths - 1].set(1)
+        attention_mask = jnp.flip(jnp.flip(attention_mask, -1).cumsum(-1), -1).astype("bool")
+        return attention_mask
+
+
+@add_start_docstrings(
+    "The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
+    WAV2VEC2_START_DOCSTRING,
+)
+class FlaxWav2Vec2Model(FlaxWav2Vec2PreTrainedModel):
+    module_class = FlaxWav2Vec2Module
+
+
+FLAX_WAV2VEC2_MODEL_DOCSTRING = """
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoProcessor, FlaxWav2Vec2Model
+    >>> from datasets import load_dataset
+    >>> import soundfile as sf
+
+    >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60")
+    >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
+
+
+    >>> def map_to_array(batch):
+    ...     speech, _ = sf.read(batch["file"])
+    ...     batch["speech"] = speech
+    ...     return batch
+
+
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = ds.map(map_to_array)
+
+    >>> input_values = processor(
+    ...     ds["speech"][0], sampling_rate=16_000, return_tensors="np"
+    ... 
).input_values # Batch size 1 + >>> hidden_states = model(input_values).last_hidden_state + ``` +""" + +overwrite_call_docstring( + FlaxWav2Vec2Model, + WAV2VEC2_INPUTS_DOCSTRING + FLAX_WAV2VEC2_MODEL_DOCSTRING, +) +append_replace_return_docstrings( + FlaxWav2Vec2Model, output_type=FlaxWav2Vec2BaseModelOutput, config_class=Wav2Vec2Config +) + + +class FlaxWav2Vec2ForCTCModule(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.wav2vec2 = FlaxWav2Vec2Module(self.config, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.final_dropout) + self.lm_head = nn.Dense( + self.config.vocab_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__( + self, + input_values, + attention_mask=None, + mask_time_indices=None, + deterministic=True, + output_attentions=None, + output_hidden_states=None, + freeze_feature_encoder=False, + return_dict=None, + ): + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + mask_time_indices=mask_time_indices, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + freeze_feature_encoder=freeze_feature_encoder, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + + logits = self.lm_head(hidden_states) + + if not return_dict: + return (logits,) + outputs[2:] + + return FlaxCausalLMOutput(logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + def _get_feat_extract_output_lengths( + self, + input_lengths: Union[jnp.ndarray, int], + add_adapter: Optional[bool] = None, + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + +@add_start_docstrings( + "Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).", + WAV2VEC2_START_DOCSTRING, +) +class FlaxWav2Vec2ForCTC(FlaxWav2Vec2PreTrainedModel): + module_class = FlaxWav2Vec2ForCTCModule + + +FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC + >>> from datasets import load_dataset + >>> import soundfile as sf + + >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60") + >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60") + + + >>> def map_to_array(batch): + ... speech, _ = sf.read(batch["file"]) + ... batch["speech"] = speech + ... return batch + + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = ds.map(map_to_array) + + >>> input_values = processor( + ... ds["speech"][0], sampling_rate=16_000, return_tensors="np" + ... 
).input_values # Batch size 1 + >>> logits = model(input_values).logits + >>> predicted_ids = jnp.argmax(logits, axis=-1) + + >>> transcription = processor.decode(predicted_ids[0]) + >>> # should give: "A MAN SAID TO THE UNIVERSE SIR I EXIST" + ``` +""" + +overwrite_call_docstring( + FlaxWav2Vec2ForCTC, + WAV2VEC2_INPUTS_DOCSTRING + FLAX_WAV2VEC2_FOR_CTC_DOCSTRING, +) +append_replace_return_docstrings(FlaxWav2Vec2ForCTC, output_type=FlaxCausalLMOutput, config_class=Wav2Vec2Config) + + +class FlaxWav2Vec2ForPreTrainingModule(nn.Module): + config: Wav2Vec2Config + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.wav2vec2 = FlaxWav2Vec2Module(self.config, dtype=self.dtype) + self.dropout_features = nn.Dropout(self.config.feat_quantizer_dropout) + + self.quantizer = FlaxWav2Vec2GumbelVectorQuantizer(self.config, dtype=self.dtype) + self.project_q = nn.Dense( + self.config.proj_codevector_dim, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.project_hid = nn.Dense( + self.config.proj_codevector_dim, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__( + self, + input_values, + attention_mask=None, + mask_time_indices=None, + gumbel_temperature: int = 1, + deterministic: bool = True, + output_attentions=None, + output_hidden_states=None, + freeze_feature_encoder=False, + return_dict=None, + ): + r""" + Returns: + + Example: + + ```python + + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + mask_time_indices=mask_time_indices, + deterministic=deterministic, + freeze_feature_encoder=freeze_feature_encoder, + return_dict=return_dict, + ) + + # project all transformed features (including masked) to final vq dim + transformer_features = self.project_hid(outputs[0]) + + # quantize all (unmasked) extracted features and project to final vq dim + extract_features = self.dropout_features(outputs[1], deterministic=deterministic) + quantized_features, codevector_perplexity = self.quantizer( + extract_features, mask_time_indices, deterministic=deterministic, temperature=gumbel_temperature + ) + quantized_features = self.project_q(quantized_features) + + if not return_dict: + return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:] + + return FlaxWav2Vec2ForPreTrainingOutput( + projected_states=transformer_features, + projected_quantized_states=quantized_features, + codevector_perplexity=codevector_perplexity, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def _get_feat_extract_output_lengths( + self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None + ): + """ + Computes the output length of the convolutional layers + """ + + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = 
_conv_out_length(input_lengths, 1, self.config.adapter_stride) + + return input_lengths + + +@add_start_docstrings("""Wav2Vec2 Model with a quantizer and `VQ` head on top.""", WAV2VEC2_START_DOCSTRING) +class FlaxWav2Vec2ForPreTraining(FlaxWav2Vec2PreTrainedModel): + module_class = FlaxWav2Vec2ForPreTrainingModule + + @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING) + # overwrite since has `gumbel_temperature` input + def __call__( + self, + input_values, + attention_mask=None, + mask_time_indices=None, + gumbel_temperature: int = 1, + params: dict = None, + dropout_rng: jax.random.PRNGKey = None, + gumbel_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + freeze_feature_encoder: bool = False, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + batch_size, sequence_length = input_values.shape + + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + if gumbel_rng is not None: + rngs["gumbel"] = gumbel_rng + + inputs = {"params": params or self.params} + + return self.module.apply( + inputs, + jnp.array(input_values, dtype="f4"), + jnp.array(attention_mask, dtype="i4"), + mask_time_indices, + gumbel_temperature, + not train, + output_attentions, + output_hidden_states, + freeze_feature_encoder, + return_dict, + rngs=rngs, + ) + + +FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> import optax + >>> import numpy as np + >>> import jax.numpy as jnp + >>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining + >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices + >>> from datasets import load_dataset + >>> import soundfile as sf + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60") + >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60") + + + >>> def map_to_array(batch): + ... speech, _ = sf.read(batch["file"]) + ... batch["speech"] = speech + ... 
return batch + + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = ds.map(map_to_array) + + >>> input_values = feature_extractor(ds["speech"][0], return_tensors="np").input_values # Batch size 1 + + >>> # compute masked indices + >>> batch_size, raw_sequence_length = input_values.shape + >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length) + >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2) + + >>> outputs = model(input_values, mask_time_indices=mask_time_indices) + + >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states) + >>> cosine_sim = optax.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states) + + >>> # show that cosine similarity is much higher than random + >>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5 + ``` +""" + +overwrite_call_docstring( + FlaxWav2Vec2ForPreTraining, + WAV2VEC2_INPUTS_DOCSTRING + FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxWav2Vec2ForPreTraining, output_type=FlaxWav2Vec2ForPreTrainingOutput, config_class=Wav2Vec2Config +) + + +__all__ = ["FlaxWav2Vec2ForCTC", "FlaxWav2Vec2ForPreTraining", "FlaxWav2Vec2Model", "FlaxWav2Vec2PreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_tf_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..c385c192a987d5bd7425b6cd7c9ac6245c336c11 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -0,0 +1,1858 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TensorFlow Wav2Vec2 model.""" + +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import TFBaseModelOutput, TFCausalLMOutput, TFSequenceClassifierOutput +from ...modeling_tf_utils import ( + TFPreTrainedModel, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_wav2vec2 import Wav2Vec2Config + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h" +_CONFIG_FOR_DOC = "Wav2Vec2Config" + + +LARGE_NEGATIVE = -1e8 + + +@dataclass +class TFWav2Vec2BaseModelOutput(ModelOutput): + """ + Output type of [`TFWav2Vec2BaseModelOutput`], with potential hidden states and attentions. 
+
+    Args:
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        extract_features (`tf.Tensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
+            Sequence of extracted feature vectors of the last convolutional layer of the model.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    last_hidden_state: Optional[tf.Tensor] = None
+    extract_features: Optional[tf.Tensor] = None
+    hidden_states: Tuple[tf.Tensor] | None = None
+    attentions: Tuple[tf.Tensor] | None = None
+
+
+def _sample_without_replacement(distribution, num_samples):
+    """
+    Categorical sampling without replacement is currently not implemented. The Gumbel-max trick will do for now - see
+    https://github.com/tensorflow/tensorflow/issues/9260 for more info
+    """
+    z = -tf.math.log(tf.random.uniform(shape_list(distribution), 0, 1))
+    _, indices = tf.nn.top_k(distribution + z, num_samples)
+    return indices
+
+
+def _scatter_values_on_batch_indices(values, batch_indices, output_shape):
+    """
+    Scatter function as in PyTorch with indices in format (batch_dim, indices)
+    """
+    indices_shape = shape_list(batch_indices)
+    # broadcast batch dim to indices_shape
+    broad_casted_batch_dims = tf.reshape(
+        tf.broadcast_to(tf.expand_dims(tf.range(indices_shape[0]), axis=-1), indices_shape), [1, -1]
+    )
+    # transform batch_indices to pair_indices
+    pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0))
+    # scatter values to pair indices
+    return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), output_shape)
+
+
+def _compute_mask_indices(
+    shape: Tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    min_masks: int = 0,
+) -> tf.Tensor:
+    """
+    Computes random mask spans for a given shape.
+
+    Args:
+        shape: the shape for which to compute masks.
+            Should be of size 2, where the first element is the batch size and the second is the number of timesteps.
+        mask_prob:
+            probability for each token to be chosen as the start of a span to be masked. This will be multiplied by
+            the number of timesteps divided by the length of the mask span, so that approximately this percentage of
+            all elements is masked. However, due to overlaps, the actual number will be smaller.
+        mask_length: size of each masked span
+        min_masks: minimum number of masked spans
+
+    Adapted from [fairseq's
+    data_utils.py](https://github.com/pytorch/fairseq/blob/e0788f7007a8473a76db573985031f3c94201e79/fairseq/data/data_utils.py#L376).
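+
+    For intuition, a hypothetical call (the spans are placed randomly, so only the shape is deterministic):
+
+    ```python
+    >>> mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.2, mask_length=2)
+    >>> # `mask` has shape (2, 100); each row holds roughly 0.2 * 100 / 2 = 10 spans of two consecutive ones
+    ```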
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + tf.debugging.assert_less( + mask_length, + sequence_length, + message=( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and" + f" `sequence_length`: {sequence_length}`" + ), + ) + + # compute number of masked spans in batch + num_masked_spans = mask_prob * tf.cast(sequence_length, tf.float32) / mask_length + tf.random.uniform((1,)) + num_masked_spans = tf.maximum(num_masked_spans, min_masks) + num_masked_spans = tf.cast(num_masked_spans, tf.int32) + + # make sure num masked indices <= sequence_length + num_masked_spans = tf.math.minimum(sequence_length // mask_length, num_masked_spans) + num_masked_spans = tf.squeeze(num_masked_spans) + + # SpecAugment mask to fill + spec_aug_mask = tf.zeros((batch_size, sequence_length), dtype=tf.int32) + + # uniform distribution to sample from, make sure that offset samples are < sequence_length + uniform_dist = tf.ones((batch_size, sequence_length - (mask_length - 1))) + + # get random indices to mask + spec_aug_mask_idxs = _sample_without_replacement(uniform_dist, num_masked_spans) + + # expand masked indices to masked spans + spec_aug_mask_idxs = tf.expand_dims(spec_aug_mask_idxs, -1) + spec_aug_mask_idxs = tf.tile(spec_aug_mask_idxs, (1, 1, mask_length)) + spec_aug_mask_idxs = tf.reshape(spec_aug_mask_idxs, (batch_size, num_masked_spans * mask_length)) + + offsets = tf.range(mask_length)[tf.newaxis, tf.newaxis, :] + offsets = tf.tile(offsets, (batch_size, num_masked_spans, 1)) + offsets = tf.reshape(offsets, (batch_size, num_masked_spans * mask_length)) + + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # scatter indices to mask + spec_aug_mask = _scatter_values_on_batch_indices( + tf.ones_like(spec_aug_mask_idxs), spec_aug_mask_idxs, tf.shape(spec_aug_mask) + ) + + return spec_aug_mask + + +# Copied from transformers.models.bart.modeling_tf_bart._expand_mask +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +class TFWav2Vec2GroupNorm(keras.layers.Layer): + """ + From tensorflow-addons https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization + """ + + def __init__( + self, + groups: int = 32, + axis: int = -1, + epsilon: float = 1e-3, + center: bool = True, + scale: bool = True, + beta_initializer: keras.initializers.Initializer = "zeros", + gamma_initializer: keras.initializers.Initializer = "ones", + beta_regularizer: keras.regularizers.Regularizer = None, + gamma_regularizer: keras.regularizers.Regularizer = None, + beta_constraint: keras.constraints.Constraint = None, + gamma_constraint: keras.constraints.Constraint = None, + **kwargs, + ): + super().__init__(**kwargs) + self.supports_masking = True + self.groups = groups + self.axis = axis + self.epsilon = epsilon + self.center = center + self.scale = scale + self.beta_initializer = keras.initializers.get(beta_initializer) + self.gamma_initializer = keras.initializers.get(gamma_initializer) + self.beta_regularizer = keras.regularizers.get(beta_regularizer) + self.gamma_regularizer = keras.regularizers.get(gamma_regularizer) + self.beta_constraint = keras.constraints.get(beta_constraint) + self.gamma_constraint = keras.constraints.get(gamma_constraint) + self._check_axis() + + def build(self, input_shape): + self._check_if_input_shape_is_none(input_shape) + self._set_number_of_groups_for_instance_norm(input_shape) + self._check_size_of_dimensions(input_shape) + self._create_input_spec(input_shape) + + self._add_gamma_weight(input_shape) + self._add_beta_weight(input_shape) + self.built = True + super().build(input_shape) + + def call(self, inputs): + input_shape = keras.backend.int_shape(inputs) + tensor_input_shape = tf.shape(inputs) + + reshaped_inputs, group_shape = self._reshape_into_groups(inputs, input_shape, tensor_input_shape) + + normalized_inputs = self._apply_normalization(reshaped_inputs, input_shape) + + is_instance_norm = (input_shape[self.axis] // self.groups) == 1 + if not is_instance_norm: + outputs = tf.reshape(normalized_inputs, tensor_input_shape) + else: + outputs = normalized_inputs + + return outputs + + def get_config(self): + config = { + "groups": self.groups, + "axis": self.axis, + "epsilon": self.epsilon, + "center": self.center, + "scale": self.scale, + "beta_initializer": keras.initializers.serialize(self.beta_initializer), + "gamma_initializer": keras.initializers.serialize(self.gamma_initializer), + "beta_regularizer": keras.regularizers.serialize(self.beta_regularizer), + "gamma_regularizer": keras.regularizers.serialize(self.gamma_regularizer), + "beta_constraint": keras.constraints.serialize(self.beta_constraint), + "gamma_constraint": keras.constraints.serialize(self.gamma_constraint), + } + base_config = super().get_config() + return {**base_config, **config} + + def compute_output_shape(self, input_shape): + return input_shape + + def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): + group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] + is_instance_norm = (input_shape[self.axis] // self.groups) == 1 + if not is_instance_norm: + group_shape[self.axis] = input_shape[self.axis] // self.groups + group_shape.insert(self.axis, self.groups) + group_shape = 
tf.stack(group_shape) + reshaped_inputs = tf.reshape(inputs, group_shape) + return reshaped_inputs, group_shape + else: + return inputs, group_shape + + def _apply_normalization(self, reshaped_inputs, input_shape): + group_shape = keras.backend.int_shape(reshaped_inputs) + group_reduction_axes = list(range(1, len(group_shape))) + is_instance_norm = (input_shape[self.axis] // self.groups) == 1 + if not is_instance_norm: + axis = -2 if self.axis == -1 else self.axis - 1 + else: + axis = -1 if self.axis == -1 else self.axis - 1 + group_reduction_axes.pop(axis) + + mean, variance = tf.nn.moments(reshaped_inputs, group_reduction_axes, keepdims=True) + + gamma, beta = self._get_reshaped_weights(input_shape) + normalized_inputs = tf.nn.batch_normalization( + reshaped_inputs, + mean=mean, + variance=variance, + scale=gamma, + offset=beta, + variance_epsilon=self.epsilon, + ) + return normalized_inputs + + def _get_reshaped_weights(self, input_shape): + broadcast_shape = self._create_broadcast_shape(input_shape) + gamma = None + beta = None + if self.scale: + gamma = tf.reshape(self.gamma, broadcast_shape) + + if self.center: + beta = tf.reshape(self.beta, broadcast_shape) + return gamma, beta + + def _check_if_input_shape_is_none(self, input_shape): + dim = input_shape[self.axis] + if dim is None: + raise ValueError( + "Axis " + + str(self.axis) + + " of input tensor should have a defined dimension but the layer received an input with shape " + + str(input_shape) + + "." + ) + + def _set_number_of_groups_for_instance_norm(self, input_shape): + dim = input_shape[self.axis] + + if self.groups == -1: + self.groups = dim + + def _check_size_of_dimensions(self, input_shape): + dim = input_shape[self.axis] + if dim < self.groups: + raise ValueError( + "Number of groups (" + + str(self.groups) + + ") cannot be more than the number of channels (" + + str(dim) + + ")." + ) + + if dim % self.groups != 0: + raise ValueError( + "Number of groups (" + + str(self.groups) + + ") must be a multiple of the number of channels (" + + str(dim) + + ")." + ) + + def _check_axis(self): + if self.axis == 0: + raise ValueError( + "You are trying to normalize your batch axis. 
Do you want to use tf.layer.batch_normalization instead" + ) + + def _create_input_spec(self, input_shape): + dim = input_shape[self.axis] + self.input_spec = keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) + + def _add_gamma_weight(self, input_shape): + dim = input_shape[self.axis] + shape = (dim,) + + if self.scale: + self.gamma = self.add_weight( + shape=shape, + name="gamma", + initializer=self.gamma_initializer, + regularizer=self.gamma_regularizer, + constraint=self.gamma_constraint, + ) + else: + self.gamma = None + + def _add_beta_weight(self, input_shape): + dim = input_shape[self.axis] + shape = (dim,) + + if self.center: + self.beta = self.add_weight( + shape=shape, + name="beta", + initializer=self.beta_initializer, + regularizer=self.beta_regularizer, + constraint=self.beta_constraint, + ) + else: + self.beta = None + + def _create_broadcast_shape(self, input_shape): + broadcast_shape = [1] * len(input_shape) + is_instance_norm = (input_shape[self.axis] // self.groups) == 1 + if not is_instance_norm: + broadcast_shape[self.axis] = input_shape[self.axis] // self.groups + broadcast_shape.insert(self.axis, self.groups) + else: + broadcast_shape[self.axis] = self.groups + return broadcast_shape + + +class TFWav2Vec2WeightNormConv1D(keras.layers.Conv1D): + """Adapted from https://www.tensorflow.org/probability/api_docs/python/tfp/layers/weight_norm/WeightNorm""" + + def __init__(self, filters, kernel_size, groups, explicit_padding, **kwargs): + super().__init__( + filters=filters, + kernel_size=kernel_size, + groups=groups, + padding="valid", + use_bias=True, + bias_initializer="he_normal", + **kwargs, + ) + self.explicit_padding = explicit_padding + self.filter_axis = 2 + self.kernel_norm_axes = tf.constant([0, 1]) + + def _init_norm(self): + """Set the norm of the weight vector.""" + kernel_norm = tf.sqrt(tf.reduce_sum(tf.square(self.weight_v), axis=self.kernel_norm_axes)) + self.weight_g.assign(kernel_norm[:, tf.newaxis, tf.newaxis]) + + def _normalize_kernel(self): + """Generate normalized weights.""" + kernel = tf.nn.l2_normalize(self.weight_v, axis=self.kernel_norm_axes) * tf.transpose(self.weight_g) + self.kernel = tf.transpose(kernel) + + def build(self, input_shape): + if not self.built: + super().build(input_shape) + + self.kernel = tf.Variable(tf.transpose(self.kernel), name="weight_v", trainable=True) + self.weight_v = self.kernel + + self.weight_g = self.add_weight( + name="weight_g", + shape=(int(self.weight_v.shape[self.filter_axis]), 1, 1), + initializer="ones", + dtype=self.weight_v.dtype, + trainable=True, + ) + self._init_norm() + self.bias = self.add_weight(name="bias", shape=(self.filters,), initializer="zeros", trainable=True) + + def call(self, inputs): + # TODO Matt: Assigning to attributes in call() is deeply sinful in TensorFlow, as it should be idempotent. + # This whole layer should be replaced by a layer that doesn't inherit from Conv1D, but instead calls + # a functional 1d convolution with normalized weights that it generates (but does not store!) 
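+        # weight-norm reparameterization: kernel = weight_g * weight_v / ||weight_v||
+        # (Salimans & Kingma, 2016), with the norm taken over `kernel_norm_axes`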
+ self._normalize_kernel() + + padded_inputs = tf.pad(inputs, ((0, 0), (self.explicit_padding, self.explicit_padding), (0, 0))) + output = super().call(padded_inputs) + + return output + + +class TFWav2Vec2NoLayerNormConvLayer(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = keras.layers.Conv1D( + filters=self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + strides=config.conv_stride[layer_id], + use_bias=config.conv_bias, + name="conv", + ) + self.activation = get_tf_activation(config.feat_extract_activation) + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build([None, None, self.in_conv_dim]) + + +class TFWav2Vec2LayerNormConvLayer(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = keras.layers.Conv1D( + filters=self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + strides=config.conv_stride[layer_id], + use_bias=config.conv_bias, + name="conv", + ) + self.layer_norm = keras.layers.LayerNormalization(name="layer_norm", epsilon=config.layer_norm_eps) + self.activation = get_tf_activation(config.feat_extract_activation) + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build([None, None, self.in_conv_dim]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.out_conv_dim]) + + +class TFWav2Vec2GroupNormConvLayer(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, layer_id: int = 0, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.in_conv_dim = config.conv_dim[layer_id] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = keras.layers.Conv1D( + filters=self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + strides=config.conv_stride[layer_id], + use_bias=config.conv_bias, + name="conv", + ) + self.activation = get_tf_activation(config.feat_extract_activation) + self.layer_norm = TFWav2Vec2GroupNorm( + groups=self.out_conv_dim, epsilon=config.layer_norm_eps, name="layer_norm" + ) + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build([None, None, self.in_conv_dim]) + if getattr(self, 
"layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.out_conv_dim]) + + +class TFWav2Vec2PositionalConvEmbedding(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.conv = TFWav2Vec2WeightNormConv1D( + filters=config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + groups=config.num_conv_pos_embedding_groups, + explicit_padding=config.num_conv_pos_embeddings // 2, + name="conv", + ) + self.padding = TFWav2Vec2SamePadLayer(config.num_conv_pos_embeddings) + self.activation = get_tf_activation(config.feat_extract_activation) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv", None) is not None: + with tf.name_scope(self.conv.name): + self.conv.build([None, None, self.config.hidden_size]) + + +class TFWav2Vec2SamePadLayer(keras.layers.Layer): + def __init__(self, num_conv_pos_embeddings, **kwargs): + super().__init__(**kwargs) + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def call(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, : -self.num_pad_remove, :] + return hidden_states + + +class TFWav2Vec2FeatureEncoder(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None: + super().__init__(**kwargs) + + if config.feat_extract_norm == "group": + conv_layers = [TFWav2Vec2GroupNormConvLayer(config, layer_id=0, name=f"conv_layers.{0}")] + [ + TFWav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1, name=f"conv_layers.{i + 1}") + for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + TFWav2Vec2LayerNormConvLayer(config, layer_id=i, name=f"conv_layers.{i}") + for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = conv_layers + + def call(self, input_values): + hidden_states = tf.expand_dims(input_values, -1) + for conv_layer in self.conv_layers: + hidden_states = conv_layer(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "conv_layers", None) is not None: + for conv_layer in self.conv_layers: + with tf.name_scope(conv_layer.name): + conv_layer.build(None) + + +class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. 
" + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + +class TFWav2Vec2FeatureProjection(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs): + super().__init__(**kwargs) + + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.projection = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer="zeros", + name="projection", + ) + self.dropout = keras.layers.Dropout(rate=config.feat_proj_dropout) + self.config = config + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + return hidden_states, norm_hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.conv_dim[-1]]) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, self.config.conv_dim[-1]]) + + +# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with TFBart->TFWav2Vec2 +class TFWav2Vec2Attention(keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: tf.Tensor | None = None, + past_key_value: Tuple[Tuple[tf.Tensor]] | None = None, + attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor | None]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. 
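A quick aside on the caching branches above: once `_shape` has been applied, axis 2 is the time axis, so reusing a decoder cache is just a concatenation along that axis. A minimal sketch with made-up sizes (nothing here comes from the file itself):

```python
import tensorflow as tf

# Illustrative sizes only: (bsz, num_heads, past_len, head_dim) after `_shape`.
past_key = tf.zeros((2, 4, 3, 8))
# Keys projected for the current single step.
new_key = tf.zeros((2, 4, 1, 8))
# The "reuse k, v, self_attention" branch concatenates along the time axis (axis=2).
key_states = tf.concat([past_key, new_key], axis=2)
print(key_states.shape)  # (2, 4, 4, 8)
```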
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {shape_list(attn_weights)}" + ), + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {shape_list(attention_mask)}" + ), + ) + + attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = stable_softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self.num_heads], + message=( + f"Head mask for a single layer should be of size {(self.num_heads)}, but is" + f" {shape_list(layer_head_mask)}" + ), + ) + + attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + attn_weights, (bsz, self.num_heads, tgt_len, src_len) + ) + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_probs = self.dropout(attn_weights, training=training) + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {shape_list(attn_output)}" + ), + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build([None, None, self.embed_dim]) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build([None, None, self.embed_dim]) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build([None, None, self.embed_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.embed_dim]) + + +class TFWav2Vec2FeedForward(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs): + super().__init__(**kwargs) + + self.intermediate_dropout = keras.layers.Dropout(config.activation_dropout) + + 
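The `attn_weights + attention_mask` line above relies on the usual additive-mask convention: positions to be ignored carry a large negative value, so their probability is driven to zero by the softmax. A toy sketch (the values are arbitrary; the real mask is produced by `_expand_mask`):

```python
import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 0.5]])          # raw attention scores for one query
additive_mask = tf.constant([[0.0, 0.0, -1e9]])  # third key position is padding
probs = tf.nn.softmax(scores + additive_mask, axis=-1)
print(probs.numpy().round(2))  # ~[[0.73, 0.27, 0.0]]
```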
self.intermediate_dense = keras.layers.Dense( + units=config.intermediate_size, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer="zeros", + name="intermediate_dense", + ) + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + + self.output_dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + bias_initializer="zeros", + name="output_dense", + ) + self.output_dropout = keras.layers.Dropout(config.hidden_dropout) + self.config = config + + def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states, training=training) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states, training=training) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "intermediate_dense", None) is not None: + with tf.name_scope(self.intermediate_dense.name): + self.intermediate_dense.build([None, None, self.config.hidden_size]) + if getattr(self, "output_dense", None) is not None: + with tf.name_scope(self.output_dense.name): + self.output_dense.build([None, None, self.config.intermediate_size]) + + +class TFWav2Vec2EncoderLayer(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs): + super().__init__(**kwargs) + self.attention = TFWav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + name="attention", + ) + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.feed_forward = TFWav2Vec2FeedForward(config, name="feed_forward") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = False, + training: bool = False, + ) -> Tuple[tf.Tensor]: + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, training=training + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "feed_forward", None) is not None: + with tf.name_scope(self.feed_forward.name): + self.feed_forward.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.config.hidden_size]) + + +class 
TFWav2Vec2EncoderLayerStableLayerNorm(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs): + super().__init__(**kwargs) + self.attention = TFWav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + name="attention", + ) + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.feed_forward = TFWav2Vec2FeedForward(config, name="feed_forward") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = False, + training: bool = False, + ) -> Tuple[tf.Tensor]: + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, training=training + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "feed_forward", None) is not None: + with tf.name_scope(self.feed_forward.name): + self.feed_forward.build(None) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.config.hidden_size]) + + +class TFWav2Vec2Encoder(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.pos_conv_embed = TFWav2Vec2PositionalConvEmbedding(config, name="pos_conv_embed") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer = [TFWav2Vec2EncoderLayer(config, name=f"layers.{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + hidden_states = hidden_states * tf.expand_dims(attention_mask, -1) + attention_mask = _expand_mask(attention_mask) + else: + attention_mask = None + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = 
all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + if training and (dropout_probability < self.config.layerdrop): # skip the layer + continue + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "pos_conv_embed", None) is not None: + with tf.name_scope(self.pos_conv_embed.name): + self.pos_conv_embed.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFWav2Vec2EncoderStableLayerNorm(keras.layers.Layer): + def __init__(self, config: Wav2Vec2Config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.pos_conv_embed = TFWav2Vec2PositionalConvEmbedding(config, name="pos_conv_embed") + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.dropout = keras.layers.Dropout(config.hidden_dropout) + self.layer = [ + TFWav2Vec2EncoderLayerStableLayerNorm(config, name=f"layers.{i}") for i in range(config.num_hidden_layers) + ] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + hidden_states = hidden_states * tf.expand_dims(attention_mask, -1) + attention_mask = _expand_mask(attention_mask) + else: + attention_mask = None + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states, training=training) + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + if training and (dropout_probability < self.config.layerdrop): # skip the layer + continue + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if 
not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "pos_conv_embed", None) is not None: + with tf.name_scope(self.pos_conv_embed.name): + self.pos_conv_embed.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFWav2Vec2MainLayer(keras.layers.Layer): + config_class = Wav2Vec2Config + + def __init__(self, config: Wav2Vec2Config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.feature_extractor = TFWav2Vec2FeatureEncoder(config, name="feature_extractor") + self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection") + + if config.do_stable_layer_norm: + self.encoder = TFWav2Vec2EncoderStableLayerNorm(config, name="encoder") + else: + self.encoder = TFWav2Vec2Encoder(config, name="encoder") + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if self.config.mask_time_prob > 0.0 or self.config.mask_feature_prob > 0.0: + self.masked_spec_embed = self.add_weight( + shape=(self.config.hidden_size,), initializer="uniform", trainable=True, name="masked_spec_embed" + ) + if getattr(self, "feature_extractor", None) is not None: + with tf.name_scope(self.feature_extractor.name): + self.feature_extractor.build(None) + if getattr(self, "feature_projection", None) is not None: + with tf.name_scope(self.feature_projection.name): + self.feature_projection.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + + def _get_feat_extract_output_lengths(self, input_lengths: tf.Tensor): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html + return (input_length - kernel_size) // stride + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + return input_lengths + + def _mask_hidden_states(self, hidden_states: tf.Tensor, mask_time_indices: tf.Tensor | None = None): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). 
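To make the `_conv_out_length` formula defined just above concrete: with the default wav2vec 2.0 feature extractor (kernel sizes `(10, 3, 3, 3, 3, 2, 2)` and strides `(5, 2, 2, 2, 2, 2, 2)`, the library defaults), one second of 16 kHz audio collapses to 49 frames, roughly one frame every 20 ms. A small sketch of that arithmetic:

```python
def conv_out_length(input_length: int, kernel_size: int, stride: int) -> int:
    # Same formula as `_conv_out_length` above.
    return (input_length - kernel_size) // stride + 1

length = 16000  # one second of 16 kHz audio
for kernel_size, stride in zip((10, 3, 3, 3, 3, 2, 2), (5, 2, 2, 2, 2, 2, 2)):
    length = conv_out_length(length, kernel_size, stride)
print(length)  # 49
```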
+ """ + batch_size, sequence_length, hidden_size = shape_list(hidden_states) + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states = tf.where( + tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool), + self.masked_spec_embed[tf.newaxis, tf.newaxis, :], + hidden_states, + ) + + elif self.config.mask_time_prob > 0: + # generate indices & apply SpecAugment along time axis + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + min_masks=2, + ) + hidden_states = tf.where( + tf.cast(mask_time_indices[:, :, tf.newaxis], tf.bool), + self.masked_spec_embed[tf.newaxis, tf.newaxis, :], + hidden_states, + ) + + # apply SpecAugment along feature axis + if self.config.mask_feature_prob > 0: + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + ) + hidden_states = tf.where(mask_feature_indices[:, tf.newaxis, :], hidden_states, 0) + + return hidden_states + + @unpack_inputs + def call( + self, + input_values: tf.Tensor, + attention_mask: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs: Any, + ): + extract_features = self.feature_extractor(tf.cast(input_values, tf.float32), training=training) + # extract_features = tf.transpose(extract_features, perm=(0, 2, 1)) + + if attention_mask is not None: + # compute real output lengths according to convolution formula + output_lengths = self._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, -1)) + + attention_mask = tf.sequence_mask( + output_lengths, maxlen=shape_list(extract_features)[1], dtype=extract_features.dtype + ) + + hidden_states, extract_features = self.feature_projection(extract_features, training=training) + + mask_time_indices = kwargs.get("mask_time_indices", None) + if training: + hidden_states = self._mask_hidden_states(hidden_states, mask_time_indices=mask_time_indices) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + hidden_states = encoder_outputs[0] + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return TFWav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class TFWav2Vec2PreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = Wav2Vec2Config + base_model_prefix = "wav2vec2" + main_input_name = "input_values" + + @property + def input_signature(self): + return { + "input_values": tf.TensorSpec((None, None), tf.float32, name="input_values"), + "attention_mask": tf.TensorSpec((None, None), tf.float32, name="attention_mask"), + } + + @property + def dummy_inputs(self): + return { + "input_values": tf.random.uniform(shape=(1, 500), dtype=tf.float32), + "attention_mask": tf.ones(shape=(1, 500), dtype=tf.float32), + } + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + logger.warning( + f"\n{self.__class__.__name__} has backpropagation operations that are NOT supported on CPU. If you wish " + "to train/fine-tune this model, you need a GPU or a TPU" + ) + + def _get_feat_extract_output_lengths(self, input_lengths, add_adapter=None): + """ + Computes the output length of the convolutional layers + """ + add_adapter = self.config.add_adapter if add_adapter is None else add_adapter + + def _conv_out_length(input_length, kernel_size, stride): + return tf.math.floordiv(input_length - kernel_size, stride) + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + if add_adapter: + for _ in range(self.config.num_adapter_layers): + input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride) + return input_lengths + + def _get_feature_vector_attention_mask( + self, feature_vector_length: int, attention_mask: tf.Tensor, add_adapter=None + ): + non_padded_lengths = tf.math.cumsum(attention_mask, axis=-1)[:, -1] + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter) + output_lengths = tf.cast(output_lengths, tf.int32) + batch_size = tf.shape(attention_mask)[0] + # check device here + attention_mask = tf.zeros( + (batch_size, feature_vector_length), dtype=attention_mask.dtype, name="attention_mask" + ) # these two operations makes sure that all values before the output lengths idxs are attended to + ## check device + attention_mask = tf.tensor_scatter_nd_update( + attention_mask, + indices=tf.stack([tf.range(batch_size), output_lengths - 1], axis=1), + updates=tf.ones([batch_size], dtype=attention_mask.dtype), + ) + attention_mask = tf.reverse(attention_mask, axis=[-1]) + attention_mask = tf.cumsum(attention_mask, axis=-1) + attention_mask = tf.reverse(attention_mask, axis=[-1]) + attention_mask = tf.cast(attention_mask, tf.bool) + return attention_mask + + +WAV2VEC2_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. 
Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_values` only and nothing else: `model(input_values)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_values, attention_mask])` or `model([input_values, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_values": input_values, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +WAV2VEC2_INPUTS_DOCSTRING = r""" + Args: + input_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_values` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_values` indices into associated vectors + than the model's internal embedding lookup matrix. 
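As a usage illustration of the three input formats listed above, the equivalent calls look like this; the tiny randomly initialised config is arbitrary, chosen only so the snippet runs quickly without downloading weights:

```python
import tensorflow as tf
from transformers import TFWav2Vec2Model, Wav2Vec2Config

config = Wav2Vec2Config(hidden_size=32, num_hidden_layers=1, num_attention_heads=2, intermediate_size=64)
model = TFWav2Vec2Model(config)
input_values = tf.random.normal((1, 8000))
attention_mask = tf.ones((1, 8000))

out = model(input_values)                                                      # single tensor
out = model([input_values, attention_mask])                                    # positional list
out = model({"input_values": input_values, "attention_mask": attention_mask})  # dict keyed by input names
```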
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare TFWav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
+    WAV2VEC2_START_DOCSTRING,
+)
+class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
+    def __init__(self, config: Wav2Vec2Config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.config = config
+        self.wav2vec2 = TFWav2Vec2MainLayer(config, name="wav2vec2")
+
+    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    @unpack_inputs
+    def call(
+        self,
+        input_values: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        token_type_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]:
+        """
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, TFWav2Vec2Model
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
+
+        >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
+        >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
+
+
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     
return batch + + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> ds = ds.map(map_to_array) + + >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 + >>> hidden_states = model(input_values).last_hidden_state + ```""" + + output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states + output_attentions = output_attentions if output_attentions else self.config.output_attentions + return_dict = return_dict if return_dict else self.config.return_dict + + outputs = self.wav2vec2( + input_values=input_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wav2vec2", None) is not None: + with tf.name_scope(self.wav2vec2.name): + self.wav2vec2.build(None) + + +@add_start_docstrings( + """TFWav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", + WAV2VEC2_START_DOCSTRING, +) +class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel): + def __init__(self, config: Wav2Vec2Config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.wav2vec2 = TFWav2Vec2MainLayer(config, name="wav2vec2") + self.dropout = keras.layers.Dropout(config.final_dropout) + self.lm_head = keras.layers.Dense(config.vocab_size, name="lm_head") + self.output_hidden_size = ( + config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size + ) + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor.trainable = False + + @unpack_inputs + @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFCausalLMOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_values: tf.Tensor, + attention_mask: tf.Tensor | None = None, + token_type_ids: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + labels: tf.Tensor | None = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFCausalLMOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. 
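Before the rest of the `labels` description below, a note on the `-100` convention it introduces: the loss code further down derives the CTC target lengths by counting non-negative label positions. A toy sketch (the label values are made up):

```python
import tensorflow as tf

labels = tf.constant([[5, 12, 7, -100, -100],
                      [3, 9, -100, -100, -100]])
# Mirrors `labels_mask = tf.cast(labels >= 0, tf.int32)` in the loss computation below.
target_lengths = tf.reduce_sum(tf.cast(labels >= 0, tf.int32), axis=-1)
print(target_lengths.numpy())  # [3 2]
```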
Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]` (see `input_values` docstring). Tokens with indices set to `-100` are ignored (masked),
+            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AutoProcessor, TFWav2Vec2ForCTC
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
+
+        >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
+        >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+
+
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
+        >>> logits = model(input_values).logits
+        >>> predicted_ids = tf.argmax(logits, axis=-1)
+
+        >>> transcription = processor.decode(predicted_ids[0])
+
+        >>> # compute loss
+        >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
+
+        >>> # Pass the target transcription as `text` to encode labels
+        >>> labels = processor(text=target_transcription, return_tensors="tf").input_ids
+
+        >>> loss = model(input_values, labels=labels).loss
+        ```"""
+        if labels is not None and tf.reduce_max(labels) >= self.config.vocab_size:
+            raise ValueError(f"Label values must be < vocab_size: {self.config.vocab_size}")
+
+        outputs = self.wav2vec2(
+            input_values=input_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        logits = self.lm_head(hidden_states)
+
+        if labels is not None:
+            attention_mask = (
+                attention_mask if attention_mask is not None else tf.ones_like(input_values, dtype=tf.float32)
+            )
+            input_lengths = self.wav2vec2._get_feat_extract_output_lengths(tf.reduce_sum(attention_mask, axis=-1))
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = tf.cast(labels >= 0, tf.int32)
+            target_lengths = tf.reduce_sum(labels_mask, axis=-1)
+
+            loss = tf.nn.ctc_loss(
+                logits=logits,
+                labels=labels,
+                logit_length=input_lengths,
+                label_length=target_lengths,
+                blank_index=self.config.pad_token_id,
+                logits_time_major=False,
+            )
+
+            if self.config.ctc_loss_reduction == "sum":
+                loss = tf.reduce_sum(loss)
+            if self.config.ctc_loss_reduction == "mean":
+                loss = tf.reduce_mean(loss)
+
+            loss = tf.reshape(loss, (1,))
+        else:
+            loss = None
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "wav2vec2", None) is not None:
+            with tf.name_scope(self.wav2vec2.name):
+                self.wav2vec2.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build([None, None, self.output_hidden_size])
+
+
+class 
TFWav2Vec2ForSequenceClassification(TFWav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.wav2vec2 = TFWav2Vec2MainLayer(config, name="wav2vec2") + self.num_layers = config.num_hidden_layers + 1 + with tf.name_scope(self._name_scope()): + if config.use_weighted_layer_sum: + self.layer_weights = self.add_weight( + shape=(self.num_layers,), initializer="ones", trainable=True, name="layer_weights" + ) + self.config = config + self.projector = keras.layers.Dense(units=config.classifier_proj_size, name="projector") + self.classifier = keras.layers.Dense(units=config.num_labels, activation=None, name="classifier") + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.wav2vec2.feature_extractor.trainable = False + + def freeze_base_model(self): + """ + Calling this function will disable the gradient computation for the base model so that its parameters will not + be updated during training. Only the classification head will be updated. + """ + for layer in self.wav2vec2.layers: + layer.trainable = False + + @unpack_inputs + def call( + self, + input_values: tf.Tensor, + attention_mask: tf.Tensor | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + labels: tf.Tensor | None = None, + training: bool = False, + ) -> TFSequenceClassifierOutput | Tuple[tf.Tensor]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = tf.stack(hidden_states, axis=1) + norm_weights = tf.nn.softmax(self.layer_weights, axis=-1) + hidden_states = tf.reduce_sum(hidden_states * tf.reshape(norm_weights, [-1, 1, 1]), axis=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + if attention_mask is None: + pooled_output = tf.reduce_mean(hidden_states, axis=1) + else: + padding_mask = self._get_feature_vector_attention_mask(shape_list(hidden_states)[1], attention_mask) + padding_mask_float = tf.cast(padding_mask, hidden_states.dtype) + hidden_states = tf.multiply(hidden_states, tf.expand_dims(padding_mask_float, axis=-1)) + pooled_output = tf.divide( + tf.reduce_sum(hidden_states, axis=1), tf.expand_dims(tf.reduce_sum(padding_mask_float, axis=1), axis=1) + ) + logits = self.classifier(pooled_output) + loss = None + if labels is not None: + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + loss = loss_fn(tf.reshape(labels, [-1]), tf.reshape(logits, [-1, self.config.num_labels])) + if not return_dict: + output = (logits,) + 
outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "wav2vec2", None) is not None: + with tf.name_scope(self.wav2vec2.name): + self.wav2vec2.build(None) + if getattr(self, "projector", None) is not None: + with tf.name_scope(self.projector.name): + self.projector.build([None, None, self.config.hidden_size]) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.classifier_proj_size]) + + +__all__ = ["TFWav2Vec2ForCTC", "TFWav2Vec2Model", "TFWav2Vec2PreTrainedModel", "TFWav2Vec2ForSequenceClassification"] diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_wav2vec2.py b/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..2ac0e21486e7c49b22c7d3e78a87c046153610f8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -0,0 +1,2728 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch Wav2Vec2 model.""" + +import math +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + MaskedLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + cached_file, + is_peft_available, + is_safetensors_available, + logging, + replace_return_docstrings, +) +from .configuration_wav2vec2 import Wav2Vec2Config + + +WAV2VEC2_ADAPTER_PT_FILE = "adapter.{}.bin" +WAV2VEC2_ADAPTER_SAFE_FILE = "adapter.{}.safetensors" + +if is_safetensors_available(): + from safetensors.torch import load_file as safe_load_file + + +if is_flash_attn_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + +_HIDDEN_STATES_START_POSITION = 2 + +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2Config" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/wav2vec2-base-960h" +_EXPECTED_OUTPUT_SHAPE = [1, 292, 768] + +# CTC docstring +_CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'" +_CTC_EXPECTED_LOSS = 53.48 + +# Audio class docstring +_SEQ_CLASS_CHECKPOINT = "superb/wav2vec2-base-superb-ks" +_SEQ_CLASS_EXPECTED_OUTPUT = "'_unknown_'" +_SEQ_CLASS_EXPECTED_LOSS = 6.54 + +# Frame class docstring +_FRAME_CLASS_CHECKPOINT = "anton-l/wav2vec2-base-superb-sd" +_FRAME_EXPECTED_OUTPUT = [0, 0] + +# Speaker Verification docstring +_XVECTOR_CHECKPOINT = "anton-l/wav2vec2-base-superb-sv" +_XVECTOR_EXPECTED_OUTPUT = 0.98 + + +@dataclass +class Wav2Vec2ForPreTrainingOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[torch.FloatTensor] = None + projected_states: Optional[torch.FloatTensor] = None + projected_quantized_states: Optional[torch.FloatTensor] = None + codevector_perplexity: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + contrastive_loss: Optional[torch.FloatTensor] = None + diversity_loss: Optional[torch.FloatTensor] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.LongTensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
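Plugging numbers into the docstring above: with the library defaults `mask_prob=0.05` and `mask_length=10`, and a 292-frame sequence (the expected output shape quoted earlier in this file), each sequence receives one or two 10-frame spans depending on the random rounding epsilon:

```python
mask_prob, mask_length, sequence_length = 0.05, 10, 292
# `epsilon` is drawn uniformly from [0, 1) for probabilistic rounding.
for epsilon in (0.0, 0.99):
    num_masked_span = int(mask_prob * sequence_length / mask_length + epsilon)
    print(num_masked_span)  # 1, then 2
```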
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.detach().sum(-1).tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
+        if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller than
+            # `sequence_length` in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+            dummy_mask_idx = sequence_length - 1
+        else:
+            dummy_mask_idx = spec_aug_mask_idx[0]
+
+        spec_aug_mask_idx = np.concatenate(
+            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+        )
+        spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+    # expand masked indices to masked spans
+    spec_aug_mask_idxs = np.broadcast_to(
+        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
+
+    # add offset to the starting indexes so that indexes now create a span
+    offsets = np.arange(mask_length)[None, None, :]
+    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+        batch_size, max_num_masked_span * mask_length
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # ensure that we cannot have indices larger than sequence_length
+    if spec_aug_mask_idxs.max() > sequence_length - 1:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+def _sample_negative_indices(
+    features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
+):
+    """
+    Sample `num_negatives` vectors from feature vectors.
+    """
+    batch_size, sequence_length = features_shape
+
+    # generate indices of the positive vectors themselves, repeat them `num_negatives` times
+    sequence_length_range = np.arange(sequence_length)
+
+    # get `num_negatives` random vector indices from the same utterance
+    sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
+
+    mask_time_indices = (
+        mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
+    )
+
+    for batch_idx in range(batch_size):
+        high = mask_time_indices[batch_idx].sum() - 1
+        mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
+
+        feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
+        sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
+        # avoid sampling the same positive vector, but keep the distribution uniform
+        sampled_indices[sampled_indices >= feature_indices] += 1
+
+        # remap to actual indices
+        sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
+
+        # correct for batch size
+        sampled_negative_indices[batch_idx] += batch_idx * sequence_length
+
+    return sampled_negative_indices
+
+
+class Wav2Vec2NoLayerNormConvLayer(nn.Module):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return 
hidden_states + + +class Wav2Vec2LayerNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2GroupNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1d( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2PositionalConvEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + if hasattr(self.conv, "parametrizations"): + weight_g = self.conv.parametrizations.weight.original0 + weight_v = self.conv.parametrizations.weight.original1 + else: + weight_g = self.conv.weight_g + weight_v = self.conv.weight_v + deepspeed.zero.register_external_parameter(self, weight_v) + deepspeed.zero.register_external_parameter(self, weight_g) + else: + self.conv = weight_norm(self.conv, name="weight", dim=2) + + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2SamePadLayer(nn.Module): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Wav2Vec2FeatureEncoder(nn.Module): + 
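# Note (assuming the typical base configuration, conv_stride=(5, 2, 2, 2, 2, 2, 2)): the
+ # convolution stack below downsamples raw 16 kHz audio by a factor of prod(conv_stride) = 320,
+ # i.e. it emits one feature vector per ~20 ms of audio; other checkpoints may use different strides.
+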
"""Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [ + Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.ModuleList(conv_layers) + self.gradient_checkpointing = False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + if self._requires_grad and self.training: + hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + if self._requires_grad and self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + conv_layer.__call__, + hidden_states, + ) + else: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + +class Wav2Vec2FeatureProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 +class Wav2Vec2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[Wav2Vec2Config] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+ key_states = key_states.reshape(*proj_shape)
+ value_states = value_states.reshape(*proj_shape)
+
+ src_len = key_states.size(1)
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but are"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if layer_head_mask is not None:
+ if layer_head_mask.size() != (self.num_heads,):
+ raise ValueError(
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+ # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped, past_key_value
+
+
+# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->Wav2Vec2
+class Wav2Vec2FlashAttention2(Wav2Vec2Attention):
+ """
+ Wav2Vec2 flash attention module. This module inherits from `Wav2Vec2Attention` as the weights of the module stay
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+ flash attention and deal with padding tokens in case the input contains any of them.
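+
+ A minimal loading sketch (assuming a compatible `flash-attn` installation and half-precision
+ weights; `attn_implementation` is the standard `from_pretrained` switch):
+
+ ```python
+ >>> import torch
+ >>> from transformers import Wav2Vec2Model
+
+ >>> model = Wav2Vec2Model.from_pretrained(
+ ... "facebook/wav2vec2-base-960h", attn_implementation="flash_attention_2", torch_dtype=torch.float16
+ ... ) # doctest: +SKIP
+ ```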
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # Wav2Vec2FlashAttention2 attention does not support output_attentions + if output_attentions: + raise ValueError("Wav2Vec2FlashAttention2 attention does not support output_attentions") + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, q_len, _ = hidden_states.size() + + # get query proj + query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0].transpose(1, 2) + value_states = past_key_value[1].transpose(1, 2) + elif is_cross_attention: + # cross_attentions + key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) + value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) + value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) + else: + # self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_value is not None:
+ kv_seq_len += past_key_value[0].shape[-2]
+
+ # In PEFT, usually we cast the layer norms to float32 for training stability reasons,
+ # therefore the input hidden states get silently cast to float32. Hence, we need to
+ # cast them back to the correct dtype just to be sure everything works as expected.
+ # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+ # to fp32. (LlamaRMSNorm handles it correctly)
+
+ input_dtype = query_states.dtype
+ if input_dtype == torch.float32:
+ if torch.is_autocast_enabled():
+ target_dtype = torch.get_autocast_gpu_dtype()
+ # Handle the case where the model is quantized
+ elif hasattr(self.config, "_pre_quantization_dtype"):
+ target_dtype = self.config._pre_quantization_dtype
+ else:
+ target_dtype = self.q_proj.weight.dtype
+
+ logger.warning_once(
+ f"The input hidden states seem to be silently cast to float32; this might be related to"
+ f" the fact that you have upcast embedding or layer norm layers to float32. We will cast the input back to"
+ f" {target_dtype}."
+ )
+
+ query_states = query_states.to(target_dtype)
+ key_states = key_states.to(target_dtype)
+ value_states = value_states.to(target_dtype)
+
+ attn_output = _flash_attention_forward(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ q_len,
+ dropout=self.dropout if self.training else 0.0,
+ is_causal=self.is_causal,
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
+ )
+
+ attn_output = attn_output.reshape(bsz, q_len, -1)
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+
+class Wav2Vec2SdpaAttention(Wav2Vec2Attention):
+ # Copied from transformers.models.bart.modeling_bart.BartSdpaAttention.forward with Bart->Wav2Vec2
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+ if output_attentions or layer_head_mask is not None:
+ # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+ logger.warning_once(
+ "Wav2Vec2Model is using Wav2Vec2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+ ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+ )
+ return super().forward(
+ hidden_states,
+ key_value_states=key_value_states,
+ past_key_value=past_key_value,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states)
+ # get key, value proj
+ # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_value` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_value is not None
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ query_states = self._shape(query_states, tgt_len, bsz)
+
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
+ # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+ # but we are fine here as `_shape` does call `.contiguous()`.
Reference: https://github.com/pytorch/pytorch/issues/112577 + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, + ) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + +WAV2VEC2_ATTENTION_CLASSES = { + "eager": Wav2Vec2Attention, + "sdpa": Wav2Vec2SdpaAttention, + "flash_attention_2": Wav2Vec2FlashAttention2, +} + + +class Wav2Vec2FeedForward(nn.Module): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Wav2Vec2EncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = WAV2VEC2_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2EncoderLayerStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = WAV2VEC2_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, 
eps=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if getattr(config, "adapter_attn_dim", None) is not None: + self.adapter_layer = Wav2Vec2AttnAdapterLayer(config) + else: + self.adapter_layer = None + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + if self.adapter_layer is not None: + hidden_states = hidden_states + self.adapter_layer(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2Encoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + def forward( + self, + hidden_states: torch.tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + 
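# Note: when `synced_gpus` is True (DeepSpeed ZeRO-3 / FSDP), a layer selected for LayerDrop is
+ # still executed so that all ranks launch the same collective ops; only its recorded attention
+ # weights are discarded below via `layer_outputs = (None, None)`.
+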
else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2EncoderStableLayerNorm(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.ModuleList( + [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens are not attended to + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states = hidden_states * expand_attention_mask.to(dtype=hidden_states.dtype) + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states) + + synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self) + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = torch.rand([]) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer or synced_gpus: + # under fsdp or deepspeed zero3 all gpus must run in sync + # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = 
self.layer_norm(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class Wav2Vec2GumbelVectorQuantizer(nn.Module):
+ """
+ Vector quantization using Gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+ GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.num_groups = config.num_codevector_groups
+ self.num_vars = config.num_codevectors_per_group
+
+ if config.codevector_dim % self.num_groups != 0:
+ raise ValueError(
+ f"`config.codevector_dim` {config.codevector_dim} must be divisible "
+ f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
+ )
+
+ # storage for codebook variables (codewords)
+ self.codevectors = nn.Parameter(
+ torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
+ )
+ self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
+
+ # can be decayed for training
+ self.temperature = 2
+
+ @staticmethod
+ def _compute_perplexity(probs, mask=None):
+ if mask is not None:
+ mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
+ probs = torch.where(mask_extended, probs, torch.zeros_like(probs))
+ marginal_probs = probs.sum(dim=0) / mask.sum()
+ else:
+ marginal_probs = probs.mean(dim=0)
+
+ perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
+ return perplexity
+
+ def forward(self, hidden_states, mask_time_indices=None):
+ batch_size, sequence_length, hidden_size = hidden_states.shape
+
+ # project to codevector dim
+ hidden_states = self.weight_proj(hidden_states)
+ hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
+
+ if self.training:
+ # sample code vector probs via Gumbel softmax in a differentiable way
+ codevector_probs = nn.functional.gumbel_softmax(
+ hidden_states.float(), tau=self.temperature, hard=True
+ ).type_as(hidden_states)
+
+ # compute perplexity
+ codevector_soft_dist = torch.softmax(
+ hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
+ )
+ perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices)
+ else:
+ # take argmax in non-differentiable way
+ # compute hard codevector distribution (one hot)
+ codevector_idx = hidden_states.argmax(dim=-1)
+ codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
+ -1, codevector_idx.view(-1, 1), 1.0
+ )
+ codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
+
+ perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
+
+ codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
+ # use probs to retrieve codevectors
+ codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
+ codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+ codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
+
+ return codevectors, perplexity
+
+
+class Wav2Vec2Adapter(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+
+ # feature dim might need to be down-projected
+ if config.output_hidden_size != config.hidden_size:
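+ # (e.g., when this encoder feeds a decoder of a different width, as in a
+ # speech-encoder-decoder setup, `output_hidden_size` may be set to the decoder width)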
+ self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.ModuleList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = self.proj_layer_norm(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose(1, 2) + return hidden_states + + +class Wav2Vec2AdapterLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1d( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, dim=1) + + return hidden_states + + +class Wav2Vec2AttnAdapterLayer(nn.Module): + def __init__(self, config): + """ + Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed + up training throughput. + """ + super().__init__() + self.input_dim = config.adapter_attn_dim + self.hidden_dim = config.hidden_size + + self.norm = nn.LayerNorm(self.hidden_dim) + self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim) + self.act_fn = nn.ReLU() + self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim) + + def forward(self, hidden_states: torch.FloatTensor): + hidden_states = self.norm(hidden_states) + + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act_fn(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states + + +class Wav2Vec2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Wav2Vec2Config + base_model_prefix = "wav2vec2" + main_input_name = "input_values" + supports_gradient_checkpointing = True + _supports_flash_attn_2 = True + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init. 
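+ # Note: `reset_parameters()` restores the default `nn.Linear` initialization, and the
+ # `_is_hf_initialized` flag marks these modules as already initialized so the generic
+ # `nn.Linear` branch below does not overwrite them on a later pass.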
+ if isinstance(module, Wav2Vec2ForPreTraining):
+ module.project_hid.reset_parameters()
+ module.project_q.reset_parameters()
+ module.project_hid._is_hf_initialized = True
+ module.project_q._is_hf_initialized = True
+ # gumbel softmax requires special init
+ elif isinstance(module, Wav2Vec2GumbelVectorQuantizer):
+ module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+ module.weight_proj.bias.data.zero_()
+ nn.init.uniform_(module.codevectors)
+ elif isinstance(module, Wav2Vec2PositionalConvEmbedding):
+ nn.init.normal_(
+ module.conv.weight,
+ mean=0,
+ std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+ )
+ nn.init.constant_(module.conv.bias, 0)
+ elif isinstance(module, Wav2Vec2FeatureProjection):
+ k = math.sqrt(1 / module.projection.in_features)
+ nn.init.uniform_(module.projection.weight, a=-k, b=k)
+ nn.init.uniform_(module.projection.bias, a=-k, b=k)
+ elif isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, nn.Conv1d):
+ nn.init.kaiming_normal_(module.weight)
+
+ if module.bias is not None:
+ k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+ nn.init.uniform_(module.bias, a=-k, b=k)
+
+ def _get_feat_extract_output_lengths(
+ self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+ ):
+ """
+ Computes the output length of the convolutional layers
+ """
+
+ add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+
+ def _conv_out_length(input_length, kernel_size, stride):
+ # 1D convolutional layer output length formula taken
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+ if add_adapter:
+ for _ in range(self.config.num_adapter_layers):
+ input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+
+ return input_lengths
+
+ def _get_feature_vector_attention_mask(
+ self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+ ):
+ # Effectively attention_mask.sum(-1), but not inplace to be able to run
+ # in inference mode.
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+ output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+ output_lengths = output_lengths.to(torch.long)
+
+ batch_size = attention_mask.shape[0]
+
+ attention_mask = torch.zeros(
+ (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+ )
+ # these two operations make sure that all values before the output length indices are attended to
+ attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+ return attention_mask
+
+ def _get_adapters(self):
+ if self.config.adapter_attn_dim is None:
+ raise ValueError(f"{self.__class__} has no adapter layers.
Make sure to define `config.adapter_attn_dim`.")
+
+ adapter_weights = {}
+ for name, module in self.named_modules():
+ if isinstance(module, Wav2Vec2AttnAdapterLayer):
+ for param_name, param in module.named_parameters():
+ adapter_weights[".".join([name, param_name])] = param
+
+ if isinstance(self, Wav2Vec2ForCTC):
+ for name, param in self.lm_head.named_parameters():
+ adapter_weights[".".join(["lm_head", name])] = param
+
+ return adapter_weights
+
+ def init_adapter_layers(self):
+ """
+ (Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning
+ """
+ # init attention adapters
+ for module in self.modules():
+ if isinstance(module, Wav2Vec2AttnAdapterLayer):
+ self._init_weights(module)
+
+ # init lm head
+ if isinstance(self, Wav2Vec2ForCTC):
+ self._init_weights(self.lm_head)
+
+ def load_adapter(self, target_lang: str, force_load=True, **kwargs):
+ r"""
+ Load a language adapter model from a pre-trained adapter model.
+
+ Parameters:
+ target_lang (`str`):
+ Has to be a language id of an existing adapter weight. Adapter weights are stored in the format
+ adapter.<lang>.safetensors or adapter.<lang>.bin
+ force_load (`bool`, defaults to `True`):
+ Whether the weights shall be loaded even if `target_lang` matches `self.target_lang`.
+ cache_dir (`Union[str, os.PathLike]`, *optional*):
+ Path to a directory in which a downloaded pretrained model configuration should be cached if the
+ standard cache should not be used.
+ force_download (`bool`, *optional*, defaults to `False`):
+ Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+ cached versions if they exist.
+ resume_download:
+ Deprecated and ignored. All downloads are now resumed by default when possible.
+ Will be removed in v5 of Transformers.
+ proxies (`Dict[str, str]`, *optional*):
+ A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+ 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+ local_files_only (`bool`, *optional*, defaults to `False`):
+ Whether or not to only look at local files (i.e., do not try to download the model).
+ token (`str` or `bool`, *optional*):
+ The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
+ the token generated when running `huggingface-cli login` (stored in `~/.huggingface`).
+ revision (`str`, *optional*, defaults to `"main"`):
+ The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+ git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+ identifier allowed by git.
+
+ <Tip>
+
+ To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`.
+
+ </Tip>
+
+ mirror (`str`, *optional*):
+ Mirror source to accelerate downloads in China. If you are from China and have an accessibility
+ problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
+ Please refer to the mirror site for more information.
+
+ <Tip>
+
+ Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
+ use this method in a firewalled environment.
+
+ </Tip>
+
+ Examples:
+
+ ```python
+ >>> from transformers import Wav2Vec2ForCTC, AutoProcessor
+
+ >>> ckpt = "facebook/mms-1b-all"
+ >>> processor = AutoProcessor.from_pretrained(ckpt)
+ >>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng")
+ >>> # set specific language
+ >>> processor.tokenizer.set_target_lang("spa")
+ >>> model.load_adapter("spa")
+ ```
+ """
+ if self.config.adapter_attn_dim is None:
+ raise ValueError(f"Cannot load_adapter for {target_lang} if `config.adapter_attn_dim` is not defined.")
+
+ if target_lang == self.target_lang and not force_load:
+ logger.warning(f"Adapter weights are already set to {target_lang}.")
+ return
+
+ cache_dir = kwargs.pop("cache_dir", None)
+ force_download = kwargs.pop("force_download", False)
+ resume_download = kwargs.pop("resume_download", None)
+ proxies = kwargs.pop("proxies", None)
+ local_files_only = kwargs.pop("local_files_only", False)
+ token = kwargs.pop("token", None)
+ use_auth_token = kwargs.pop("use_auth_token", None)
+ revision = kwargs.pop("revision", None)
+ use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
+
+ if use_auth_token is not None:
+ warnings.warn(
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+ FutureWarning,
+ )
+ if token is not None:
+ raise ValueError(
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+ )
+ token = use_auth_token
+
+ model_path_or_id = self.config._name_or_path
+ state_dict = None
+
+ # 1. Let's first try loading a safetensors adapter weight
+ if use_safetensors is not False:
+ filepath = WAV2VEC2_ADAPTER_SAFE_FILE.format(target_lang)
+
+ try:
+ weight_path = cached_file(
+ model_path_or_id,
+ filename=filepath,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ token=token,
+ revision=revision,
+ cache_dir=cache_dir,
+ )
+
+ state_dict = safe_load_file(weight_path)
+
+ except EnvironmentError:
+ if use_safetensors:
+ # Raise any environment error raised by `cached_file`. It will have a helpful error message adapted
+ # to the original exception.
+ raise
+
+ except Exception:
+ # For any other exception, we throw a generic error.
+ if use_safetensors:
+ raise EnvironmentError(
+ f"Can't load the model for '{model_path_or_id}'. If you were trying to load it"
+ " from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+ f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a"
+ f" directory containing a file named {filepath}."
+ )
+
+ # 2. If this didn't work let's try loading a PyTorch adapter weight
+ if state_dict is None:
+ filepath = WAV2VEC2_ADAPTER_PT_FILE.format(target_lang)
+
+ try:
+ weight_path = cached_file(
+ model_path_or_id,
+ filename=filepath,
+ force_download=force_download,
+ resume_download=resume_download,
+ proxies=proxies,
+ local_files_only=local_files_only,
+ token=token,
+ revision=revision,
+ cache_dir=cache_dir,
+ )
+
+ state_dict = torch.load(
+ weight_path,
+ map_location="cpu",
+ weights_only=True,
+ )
+
+ except EnvironmentError:
+ # Raise any environment error raised by `cached_file`. It will have a helpful error message adapted
+ # to the original exception.
+ raise
+
+ except Exception:
+ # For any other exception, we throw a generic error.
+ raise EnvironmentError(
+ f"Can't load the model for '{model_path_or_id}'.
If you were trying to load it"
+ " from 'https://huggingface.co/models', make sure you don't have a local directory with the"
+ f" same name. Otherwise, make sure '{model_path_or_id}' is the correct path to a"
+ f" directory containing a file named {filepath}."
+ )
+
+ adapter_weights = self._get_adapters()
+ unexpected_keys = set(state_dict.keys()) - set(adapter_weights.keys())
+ missing_keys = set(adapter_weights.keys()) - set(state_dict.keys())
+
+ if len(unexpected_keys) > 0:
+ raise ValueError(f"The adapter weights {weight_path} have unexpected keys: {', '.join(unexpected_keys)}.")
+ elif len(missing_keys) > 0:
+ raise ValueError(f"The adapter weights {weight_path} have missing keys: {', '.join(missing_keys)}.")
+
+ # make sure the vocab size is now correct
+ target_vocab_size = state_dict["lm_head.weight"].shape[0]
+ if target_vocab_size != self.config.vocab_size:
+ self.lm_head = nn.Linear(
+ self.config.output_hidden_size, target_vocab_size, device=self.device, dtype=self.dtype
+ )
+ self.config.vocab_size = target_vocab_size
+
+ # cast the loaded adapter weights to exactly the same dtype and device as the current ones before overwriting them
+ state_dict = {k: v.to(adapter_weights[k]) for k, v in state_dict.items()}
+ self.load_state_dict(state_dict, strict=False)
+
+ # set target language correctly
+ self.target_lang = target_lang
+
+
+WAV2VEC2_START_DOCSTRING = r"""
+ Wav2Vec2 was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
+ Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
+ Auli.
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, etc.).
+
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`Wav2Vec2Config`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+WAV2VEC2_INPUTS_DOCSTRING = r"""
+ Args:
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+ into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
+ soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
+ conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
+ 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ <Tip>
+
+ `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
+ True`.
For all models whose processor has `config.return_attention_mask == False`, such as
+ [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), `attention_mask` should **not** be
+ passed to avoid degraded performance when doing batched inference. For such models `input_values` should
+ simply be padded with 0 and passed without `attention_mask`. Be aware that these models also yield slightly
+ different results depending on whether `input_values` is padded or not.
+
+ </Tip>
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.",
+ WAV2VEC2_START_DOCSTRING,
+)
+class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
+ def __init__(self, config: Wav2Vec2Config):
+ super().__init__(config)
+ self.config = config
+ self.feature_extractor = Wav2Vec2FeatureEncoder(config)
+ self.feature_projection = Wav2Vec2FeatureProjection(config)
+
+ # model only needs masking vector if mask prob is > 0.0
+ if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+ self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+ if config.do_stable_layer_norm:
+ self.encoder = Wav2Vec2EncoderStableLayerNorm(config)
+ else:
+ self.encoder = Wav2Vec2Encoder(config)
+
+ self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def freeze_feature_extractor(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ warnings.warn(
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
+ FutureWarning,
+ )
+ self.freeze_feature_encoder()
+
+ def freeze_feature_encoder(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ self.feature_extractor._freeze_parameters()
+
+ def _mask_hidden_states(
+ self,
+ hidden_states: torch.FloatTensor,
+ mask_time_indices: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.LongTensor] = None,
+ ):
+ """
+ Masks extracted features along time axis and/or along feature axis according to
+ [SpecAugment](https://arxiv.org/abs/1904.08779).
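+ Note: masking happens in place on `hidden_states`: time-axis spans are replaced with the
+ learned `masked_spec_embed` vector, while feature-axis spans are zeroed out.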
+ """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.size() + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Wav2Vec2BaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_values: Optional[torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + mask_time_indices: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose(1, 2) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + + hidden_states, extract_features = self.feature_projection(extract_features) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + 
extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings("""Wav2Vec2 Model with a quantizer and `VQ` head on top.""", WAV2VEC2_START_DOCSTRING)
+class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
+    def __init__(self, config: Wav2Vec2Config):
+        super().__init__(config)
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
+
+        self.quantizer = Wav2Vec2GumbelVectorQuantizer(config)
+
+        self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
+        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def set_gumbel_temperature(self, temperature: int):
+        """
+        Set the Gumbel softmax temperature to a given value. Only necessary for training.
+        """
+        self.quantizer.temperature = temperature
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.wav2vec2.feature_extractor._freeze_parameters()
+
+    @staticmethod
+    def compute_contrastive_logits(
+        target_features: torch.FloatTensor,
+        negative_features: torch.FloatTensor,
+        predicted_features: torch.FloatTensor,
+        temperature: float = 0.1,
+    ):
+        """
+        Compute logits for contrastive loss, using cosine similarity as the distance measure between
+        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, a temperature can be applied.
+        """
+        target_features = torch.cat([target_features, negative_features], dim=0)
+
+        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
+            target_features
+        )
+
+        # apply temperature
+        logits = logits / temperature
+        return logits
+
+    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Wav2Vec2ForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.BoolTensor] = None,
+        sampled_negative_indices: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, Wav2Vec2ForPreTrainingOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        sampled_negative_indices (`torch.LongTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
+            Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
+            Required input for pre-training.
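+            Can be obtained with [`~models.wav2vec2.modeling_wav2vec2._sample_negative_indices`], as shown in the
+            example below.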
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining
+        >>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices
+        >>> from datasets import load_dataset
+
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+        >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
+
+        >>> # compute masked indices
+        >>> batch_size, raw_sequence_length = input_values.shape
+        >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()
+        >>> mask_time_indices = _compute_mask_indices(
+        ...     shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
+        ... )
+        >>> sampled_negative_indices = _sample_negative_indices(
+        ...     features_shape=(batch_size, sequence_length),
+        ...     num_negatives=model.config.num_negatives,
+        ...     mask_time_indices=mask_time_indices,
+        ... )
+        >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long)
+        >>> sampled_negative_indices = torch.tensor(
+        ...     data=sampled_negative_indices, device=input_values.device, dtype=torch.long
+        ... )
+
+        >>> with torch.no_grad():
+        ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
+
+        >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
+        >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
+
+        >>> # show that cosine similarity is much higher than random
+        >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5
+        tensor(True)
+
+        >>> # for contrastive loss training model should be put into train mode
+        >>> model = model.train()
+        >>> loss = model(
+        ...     input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices
+        ... ).loss
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if mask_time_indices is not None:
+            mask_time_indices = mask_time_indices.to(torch.bool)
+
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            mask_time_indices=mask_time_indices,
+            return_dict=return_dict,
+        )
+
+        # 1. project all transformed features (including masked) to final vq dim
+        transformer_features = self.project_hid(outputs[0])
+
+        # 2. quantize all (unmasked) extracted features and project to final vq dim
+        extract_features = self.dropout_features(outputs[1])
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        quantized_features, codevector_perplexity = self.quantizer(
+            extract_features, mask_time_indices=mask_time_indices
+        )
+
+        quantized_features = quantized_features.to(self.project_q.weight.dtype)
+        quantized_features = self.project_q(quantized_features)
+
+        loss = contrastive_loss = diversity_loss = None
+        if sampled_negative_indices is not None:
+            batch_size, sequence_length, hidden_size = quantized_features.shape
+
+            # for training, we sample negatives
+            # 3. sample K negatives (distractors) quantized states for contrastive loss
+            # if attention_mask is passed, make sure that padded feature vectors cannot be sampled
+            # sample negative quantized vectors BTC => (BxT)C
+            negative_quantized_features = quantized_features.view(-1, hidden_size)[
+                sampled_negative_indices.long().view(-1)
+            ]
+            negative_quantized_features = negative_quantized_features.view(
+                batch_size, sequence_length, -1, hidden_size
+            ).permute(2, 0, 1, 3)
+
+            # 4. compute logits, corresponding to `logits = sim(c_t, [q_t, \tilde{q}_t]) / \kappa`
+            # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf
+            logits = self.compute_contrastive_logits(
+                quantized_features[None, :],
+                negative_quantized_features,
+                transformer_features,
+                self.config.contrastive_logits_temperature,
+            )
+
+            # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low),
+            # its cosine similarity will be masked
+            neg_is_pos = (quantized_features == negative_quantized_features).all(-1)
+
+            if neg_is_pos.any():
+                logits[1:][neg_is_pos] = float("-inf")
+
+            # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logits) =
+            # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\tilde{q}} exp(sim(c_t, \tilde{q})/\kappa))
+            logits = logits.transpose(0, 2).reshape(-1, logits.size(0))
+            target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten()
+
+            contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum")
+            # 7. compute diversity loss: \mathbf{L}_d
+            num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups
+            diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum()
+
+            # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d
+            loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss
+
+        if not return_dict:
+            if loss is not None:
+                return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+            return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+
+        return Wav2Vec2ForPreTrainingOutput(
+            loss=loss,
+            projected_states=transformer_features,
+            projected_quantized_states=quantized_features,
+            codevector_perplexity=codevector_perplexity,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            contrastive_loss=contrastive_loss,
+            diversity_loss=diversity_loss,
+        )
+
+
+@add_start_docstrings("""Wav2Vec2 Model with a `language modeling` head on top.""", WAV2VEC2_START_DOCSTRING)
+class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        warnings.warn(
+            "The class `Wav2Vec2ForMaskedLM` is deprecated. Please use `Wav2Vec2ForCTC` instead.", FutureWarning
+        )
+
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.dropout = nn.Dropout(config.final_dropout)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_values: torch.FloatTensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple, MaskedLMOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.wav2vec2(
+            input_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+        logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return output
+
+        return MaskedLMOutput(logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
+    WAV2VEC2_START_DOCSTRING,
+    """
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2ForCTC`] with adapters. Uses 'eng'
+            by default.
+    """,
+)
+class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        super().__init__(config)
+
+        self.wav2vec2 = Wav2Vec2Model(config)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        self.target_lang = target_lang
+
+        if config.vocab_size is None:
+            raise ValueError(
+                f"You are trying to instantiate {self.__class__} with a configuration that "
+                "does not define the vocabulary size of the language model head. Please "
+                "instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`, "
+                "or define `vocab_size` in your model's configuration."
+            )
+        output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def tie_weights(self):
+        """
+        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
+        passing `target_lang=...` to `from_pretrained(...)`.
+
+        This method is **not** supposed to be called by the user and is prone to be changed in the future.
+        """
+
+        # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed
+        # to correctly load adapter layers for Wav2Vec2 so that we do not have to introduce a new API to
+        # [`PreTrainedModel`]. While slightly hacky, Wav2Vec2 never has to tie input and output embeddings, so it is
+        # ok to repurpose this function here.
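+        # Illustrative, hypothetical usage that exercises this path:
+        #     model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-1b-all", target_lang="fra")
+        # `from_pretrained` calls `tie_weights`, which then loads the 'fra' adapter weights below.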
+        target_lang = self.target_lang
+
+        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
+            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
+        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
+            logger.info("By default `target_lang` is set to 'eng'.")
+        elif target_lang is not None:
+            self.load_adapter(target_lang, force_load=True)
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.wav2vec2.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2.parameters():
+            param.requires_grad = False
+
+    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=CausalLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_CTC_EXPECTED_OUTPUT,
+        expected_loss=_CTC_EXPECTED_LOSS,
+    )
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller than or
+            equal to the sequence length of the output logits. Indices are selected in `[-100, 0, ...,
+            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
+            labels in `[0, ..., config.vocab_size - 1]`.
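+
+            A minimal sketch of preparing CTC labels with a processor (assuming a `Wav2Vec2Processor` named
+            `processor` and a raw waveform `speech`; the names are illustrative):
+
+            ```python
+            >>> inputs = processor(audio=speech, text="HELLO WORLD", return_tensors="pt")  # doctest: +SKIP
+            >>> loss = model(inputs["input_values"], labels=inputs["labels"]).loss  # doctest: +SKIP
+            ```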
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be smaller than vocab_size: {self.config.vocab_size}")
+
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@add_start_docstrings(
+    """
+    Wav2Vec2 Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
+    SUPERB Keyword Spotting.
+    """,
+    WAV2VEC2_START_DOCSTRING,
+)
+class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)"
+            )
+        self.wav2vec2 = Wav2Vec2Model(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.wav2vec2.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2.parameters():
+            param.requires_grad = False
+
+    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_SEQ_CLASS_CHECKPOINT,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        modality="audio",
+        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
+        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
+    )
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+        if attention_mask is None:
+            pooled_output = hidden_states.mean(dim=1)
+        else:
+            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+            expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_padding_mask] = 0.0
+            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization.
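+
+    The head classifies every encoder output frame independently; no pooling over the time axis is applied.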
+    """,
+    WAV2VEC2_START_DOCSTRING,
+)
+class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Audio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)"
+            )
+        self.wav2vec2 = Wav2Vec2Model(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.num_labels = config.num_labels
+
+        self.init_weights()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.wav2vec2.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2.parameters():
+            param.requires_grad = False
+
+    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_FRAME_CLASS_CHECKPOINT,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        modality="audio",
+        expected_output=_FRAME_EXPECTED_OUTPUT,
+    )
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*):
+            Frame-level labels for computing the classification loss, given as one-hot vectors over
+            `config.num_labels` classes (the loss below takes the argmax over the label dimension).
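+
+            A hypothetical sketch of such one-hot frame labels (`num_frames` must match the logits' time dimension):
+
+            ```python
+            >>> import torch
+            >>> frame_ids = torch.randint(0, model.config.num_labels, (1, num_frames))  # doctest: +SKIP
+            >>> labels = torch.nn.functional.one_hot(frame_ids, num_classes=model.config.num_labels)  # doctest: +SKIP
+            ```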
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), dim=1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class AMSoftmaxLoss(nn.Module):
+    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
+        super().__init__()
+        self.scale = scale
+        self.margin = margin
+        self.num_labels = num_labels
+        self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
+        self.loss = nn.CrossEntropyLoss()
+
+    def forward(self, hidden_states, labels):
+        labels = labels.flatten()
+        weight = nn.functional.normalize(self.weight, dim=0)
+        hidden_states = nn.functional.normalize(hidden_states, dim=1)
+        cos_theta = torch.mm(hidden_states, weight)
+        # additive margin: subtract `margin` from the cosine of the true class only
+        psi = cos_theta - self.margin
+
+        onehot = nn.functional.one_hot(labels, self.num_labels)
+        logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
+        loss = self.loss(logits, labels)
+
+        return loss
+
+
+class TDNNLayer(nn.Module):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
+        self.out_conv_dim = config.tdnn_dim[layer_id]
+        self.kernel_size = config.tdnn_kernel[layer_id]
+        self.dilation = config.tdnn_dilation[layer_id]
+
+        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
+        self.activation = nn.ReLU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if is_peft_available():
+            from peft.tuners.lora import LoraLayer
+
+            if isinstance(self.kernel, LoraLayer):
+                warnings.warn(
+                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
+                    "You should exclude TDNNLayer from LoRA's target modules.",
+                )
+
+        # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
+        hidden_states = hidden_states.transpose(1, 2)
+        weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
+        hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+@add_start_docstrings(
+    """
+    Wav2Vec2 Model with an XVector feature extraction head on top for tasks like Speaker Verification.
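+
+    The head applies TDNN layers followed by statistics pooling (mean and standard deviation over time) and two
+    linear layers to produce utterance-level x-vector embeddings.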
+    """,
+    WAV2VEC2_START_DOCSTRING,
+)
+class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.wav2vec2 = Wav2Vec2Model(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
+
+        tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
+        self.tdnn = nn.ModuleList(tdnn_layers)
+
+        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
+        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
+
+        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
+
+        self.init_weights()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.wav2vec2.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2.parameters():
+            param.requires_grad = False
+
+    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the TDNN layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+
+        for kernel_size in self.config.tdnn_kernel:
+            input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
+
+        return input_lengths
+
+    @add_start_docstrings_to_model_forward(WAV2VEC2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_XVECTOR_CHECKPOINT,
+        output_type=XVectorOutput,
+        config_class=_CONFIG_FOR_DOC,
+        modality="audio",
+        expected_output=_XVECTOR_EXPECTED_OUTPUT,
+    )
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple, XVectorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the speaker classification loss (an additive-margin softmax over
+            `config.num_labels` classes). Indices should be in `[0, ..., config.num_labels - 1]`.
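+
+            A minimal sketch (hypothetical `input_values` batch of two utterances, speaker ids 0 and 1):
+
+            ```python
+            >>> labels = torch.tensor([0, 1])  # doctest: +SKIP
+            >>> loss = model(input_values, labels=labels).loss  # doctest: +SKIP
+            ```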
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states + + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.use_weighted_layer_sum: + hidden_states = outputs[_HIDDEN_STATES_START_POSITION] + hidden_states = torch.stack(hidden_states, dim=1) + norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) + hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) + else: + hidden_states = outputs[0] + + hidden_states = self.projector(hidden_states) + + for tdnn_layer in self.tdnn: + hidden_states = tdnn_layer(hidden_states) + + # Statistic Pooling + if attention_mask is None: + mean_features = hidden_states.mean(dim=1) + std_features = hidden_states.std(dim=1) + else: + feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1)) + tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths) + mean_features = [] + std_features = [] + for i, length in enumerate(tdnn_output_lengths): + mean_features.append(hidden_states[i, :length].mean(dim=0)) + std_features.append(hidden_states[i, :length].std(dim=0)) + mean_features = torch.stack(mean_features) + std_features = torch.stack(std_features) + statistic_pooling = torch.cat([mean_features, std_features], dim=-1) + + output_embeddings = self.feature_extractor(statistic_pooling) + logits = self.classifier(output_embeddings) + + loss = None + if labels is not None: + loss = self.objective(logits, labels) + + if not return_dict: + output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:] + return ((loss,) + output) if loss is not None else output + + return XVectorOutput( + loss=loss, + logits=logits, + embeddings=output_embeddings, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "Wav2Vec2ForAudioFrameClassification", + "Wav2Vec2ForCTC", + "Wav2Vec2ForMaskedLM", + "Wav2Vec2ForPreTraining", + "Wav2Vec2ForSequenceClassification", + "Wav2Vec2ForXVector", + "Wav2Vec2Model", + "Wav2Vec2PreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/processing_wav2vec2.py b/docs/transformers/build/lib/transformers/models/wav2vec2/processing_wav2vec2.py new file mode 100644 index 0000000000000000000000000000000000000000..077f5617198b9af4c93df88480938c5928834b17 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2/processing_wav2vec2.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""
+Speech processor class for Wav2Vec2
+"""
+
+import warnings
+from contextlib import contextmanager
+from typing import List, Optional, Union
+
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput
+from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
+from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
+
+
+class Wav2Vec2ProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {}
+
+
+class Wav2Vec2Processor(ProcessorMixin):
+    r"""
+    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a
+    single processor.
+
+    [`Wav2Vec2Processor`] offers all the functionalities of [`Wav2Vec2FeatureExtractor`] and [`PreTrainedTokenizer`].
+    See the docstring of [`~Wav2Vec2Processor.__call__`] and [`~Wav2Vec2Processor.decode`] for more information.
+
+    Args:
+        feature_extractor (`Wav2Vec2FeatureExtractor`):
+            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`PreTrainedTokenizer`]):
+            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+    """
+
+    feature_extractor_class = "Wav2Vec2FeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        try:
+            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+        except (OSError, ValueError):
+            warnings.warn(
+                f"Loading a tokenizer inside {cls.__name__} from a config that does not"
+                " include a `tokenizer_class` attribute is deprecated and will be "
+                "removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`"
+                " attribute to either your `config.json` or `tokenizer_config.json` "
+                "file to suppress this warning.",
+                FutureWarning,
+            )
+
+            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+            return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
+
+    def __call__(
+        self,
+        audio: AudioInput = None,
+        text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None,
+        images=None,
+        videos=None,
+        **kwargs: Unpack[Wav2Vec2ProcessorKwargs],
+    ):
+        """
+        When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
+        [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
+        [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more
+        information.
+        """
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        output_kwargs = self._merge_kwargs(
+            Wav2Vec2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(
+                audio,
+                **output_kwargs["audio_kwargs"],
+                **output_kwargs["text_kwargs"],
+                **output_kwargs["common_kwargs"],
+            )
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
+        if text is not None:
+            encodings = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def pad(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
+        [`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
+        [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
+        [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor.pad(*args, **kwargs)
+
+        input_features = kwargs.pop("input_features", None)
+        labels = kwargs.pop("labels", None)
+        if len(args) > 0:
+            input_features = args[0]
+            args = args[1:]
+
+        if input_features is not None:
+            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
+        if labels is not None:
+            labels = self.tokenizer.pad(labels, **kwargs)
+
+        if labels is None:
+            return input_features
+        elif input_features is None:
+            return labels
+        else:
+            input_features["labels"] = labels["input_ids"]
+            return input_features
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
+        to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+        Wav2Vec2.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call)."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        yield
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+
+__all__ = ["Wav2Vec2Processor"]
diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2/tokenization_wav2vec2.py b/docs/transformers/build/lib/transformers/models/wav2vec2/tokenization_wav2vec2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad51a4e4d028ff0c44ea630b022962f86e933d57
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -0,0 +1,924 @@
+# coding=utf-8
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Wav2Vec2."""
+
+import json
+import os
+import warnings
+from dataclasses import dataclass
+from itertools import groupby
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils_base import AddedToken, BatchEncoding
+from ...utils import (
+    ModelOutput,
+    PaddingStrategy,
+    TensorType,
+    add_end_docstrings,
+    is_flax_available,
+    is_tf_available,
+    is_torch_available,
+    logging,
+    to_py_obj,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+if TYPE_CHECKING:
+    if is_torch_available():
+        import torch
+    if is_tf_available():
+        import tensorflow as tf
+    if is_flax_available():
+        import jax.numpy as jnp  # noqa: F401
+
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "tokenizer_config_file": "tokenizer_config.json",
+}
+
+
+# Wav2Vec2 has no max input length
+
+WAV2VEC2_KWARGS_DOCSTRING = r"""
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                  different lengths).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+"""
+
+ListOfDict = List[Dict[str, Union[int, str]]]
+
+
+@dataclass
+class Wav2Vec2CTCTokenizerOutput(ModelOutput):
+    """
+    Output type of [`Wav2Vec2CTCTokenizer`], with transcription.
+
+    Args:
+        text (list of `str` or `str`):
+            Decoded logits in text form. Usually the speech transcription.
+        char_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
+            Offsets of the decoded characters. In combination with sampling rate and model downsampling rate char
+            offsets can be used to compute time stamps for each character.
+        word_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
+            Offsets of the decoded words. In combination with sampling rate and model downsampling rate word offsets
+            can be used to compute time stamps for each word.
+    """
+
+    text: Union[List[str], str]
+    char_offsets: Union[List[ListOfDict], ListOfDict] = None
+    word_offsets: Union[List[ListOfDict], ListOfDict] = None
+
+
+class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
+    """
+    Constructs a Wav2Vec2CTC tokenizer.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer
+    to the superclass for more information regarding such methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sentence token.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
+            this token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        word_delimiter_token (`str`, *optional*, defaults to `"|"`):
+            The token used for defining the end of a word.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to accept lowercase input and lowercase the output when decoding.
+        target_lang (`str`, *optional*):
+            A target language the tokenizer should set by default. `target_lang` has to be defined for multi-lingual,
+            nested vocabulary such as [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all).
+
+        **kwargs
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        word_delimiter_token="|",
+        replace_word_delimiter_char=" ",
+        do_lower_case=False,
+        target_lang=None,
+        **kwargs,
+    ):
+        self._word_delimiter_token = word_delimiter_token
+
+        self.do_lower_case = do_lower_case
+        self.replace_word_delimiter_char = replace_word_delimiter_char
+        self.target_lang = target_lang
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.vocab = json.load(vocab_handle)
+
+        # if target lang is defined vocab must be a nested dict
+        # with each target lang being one vocabulary
+        if target_lang is not None:
+            self.encoder = self.vocab[target_lang]
+        else:
+            self.encoder = self.vocab
+
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            do_lower_case=do_lower_case,
+            word_delimiter_token=word_delimiter_token,
+            replace_word_delimiter_char=replace_word_delimiter_char,
+            target_lang=target_lang,
+            **kwargs,
+        )
+
+        # make sure that tokens made of several
+        # characters are not split at tokenization
+        for token in self.encoder.keys():
+            if len(token) > 1:
+                self.add_tokens(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
+
+    def set_target_lang(self, target_lang: str):
+        """
+        Set the target language of a nested multi-lingual dictionary
+        """
+        if self.vocab == self.encoder:
+            raise ValueError(f"{self.vocab} is not a multi-lingual, nested tokenizer. Cannot set target language.")
+
+        if target_lang not in self.vocab:
+            raise ValueError(f"{target_lang} does not exist. Choose one of {', '.join(self.vocab.keys())}.")
+
+        self.target_lang = target_lang
+        self.init_kwargs["target_lang"] = target_lang
+        self.encoder = self.vocab[target_lang]
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        # make sure that tokens made of several
+        # characters are not split at tokenization
+        for token in self.encoder.keys():
+            if len(token) > 1:
+                self.add_tokens(AddedToken(token, rstrip=True, lstrip=True, normalized=False))
+
+    @property
+    def word_delimiter_token(self) -> str:
+        """
+        `str`: Word delimiter token. Log an error if used while not having been set.
+        """
+        if self._word_delimiter_token is None and self.verbose:
+            logger.error("Using word_delimiter_token, but it is not set yet.")
+            return None
+        return str(self._word_delimiter_token)
+
+    @property
+    def word_delimiter_token_id(self) -> Optional[int]:
+        """
+        `Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has not been
+        set.
+        """
+        if self._word_delimiter_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.word_delimiter_token)
+
+    @word_delimiter_token.setter
+    def word_delimiter_token(self, value):
+        self._word_delimiter_token = value
+
+    @word_delimiter_token_id.setter
+    def word_delimiter_token_id(self, value):
+        self._word_delimiter_token = self.convert_tokens_to_ids(value)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.decoder)
+
+    def get_vocab(self) -> Dict:
+        vocab = dict(self.encoder)
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
+        # Overwritten to never strip!
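+        # Stripping would silently alter added tokens (e.g. multi-character vocab entries) when they appear next to
+        # whitespace, so every plain-string token is wrapped with `rstrip=False, lstrip=False` here.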
+        to_add = []
+        for token in new_tokens:
+            if isinstance(token, str):
+                to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalized=False))
+            else:
+                to_add.append(token)
+
+        return super()._add_tokens(to_add, special_tokens)
+
+    def _tokenize(self, text, **kwargs):
+        """
+        Converts a string into a sequence of tokens (string), using the tokenizer.
+        """
+        if self.do_lower_case:
+            # the vocabulary is uppercase, so fold the input casing to match;
+            # the output is lowercased again when decoding
+            text = text.upper()
+
+        return list(text.replace(" ", self.word_delimiter_token))
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token (str) in an index (integer) using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) in a token (str) using the vocab."""
+        result = self.decoder.get(index, self.unk_token)
+        return result
+
+    def convert_tokens_to_string(
+        self,
+        tokens: List[str],
+        group_tokens: bool = True,
+        spaces_between_special_tokens: bool = False,
+        output_char_offsets: bool = False,
+        output_word_offsets: bool = False,
+    ) -> Dict[str, Union[str, float]]:
+        """
+        Converts connectionist-temporal-classification (CTC) output tokens into a single string.
+        """
+        if len(tokens) == 0:
+            return {"text": "", "char_offsets": [], "word_offsets": []}
+        # group same tokens into non-repeating tokens in CTC style decoding
+        if group_tokens:
+            chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
+        else:
+            chars = tokens
+            char_repetitions = len(tokens) * [1]
+
+        # filter self.pad_token which is used as CTC-blank token
+        processed_chars = list(filter(lambda char: char != self.pad_token, chars))
+
+        # replace delimiter token
+        processed_chars = [
+            self.replace_word_delimiter_char if char == self.word_delimiter_token else char for char in processed_chars
+        ]
+
+        # retrieve offsets
+        char_offsets = word_offsets = None
+        if output_char_offsets or output_word_offsets:
+            char_offsets = self._compute_offsets(char_repetitions, chars, self.pad_token)
+
+            if len(char_offsets) != len(processed_chars):
+                raise ValueError(
+                    f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
+                    " have to be of the same length, but are: "
+                    f"`len(offsets)`: {len(char_offsets)} and `len(processed_tokens)`:"
+                    f" {len(processed_chars)}"
+                )
+
+            # set tokens to correct processed token
+            for i, char in enumerate(processed_chars):
+                char_offsets[i]["char"] = char
+
+            # retrieve word offsets from character offsets
+            word_offsets = None
+            if output_word_offsets:
+                word_offsets = self._get_word_offsets(char_offsets, self.replace_word_delimiter_char)
+
+            # don't output chars if not set to True
+            if not output_char_offsets:
+                char_offsets = None
+
+        # join to string
+        join_char = " " if spaces_between_special_tokens else ""
+        string = join_char.join(processed_chars).strip()
+
+        if self.do_lower_case:
+            string = string.lower()
+
+        return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
+
+    @staticmethod
+    def _compute_offsets(
+        char_repetitions: List[int], chars: List[str], ctc_token: str
+    ) -> List[Dict[str, Union[str, int]]]:
+        end_indices = np.asarray(char_repetitions).cumsum()
+        start_indices = np.concatenate(([0], end_indices[:-1]))
+
+        offsets = [
+            {"char": t, "start_offset": s, "end_offset": e} for t, s, e in zip(chars, start_indices, end_indices)
+        ]
+
+        # filter out CTC token
+        offsets = list(filter(lambda offset: offset["char"] != ctc_token, offsets))
+        return offsets
+
+    @staticmethod
+    def _get_word_offsets(
+        offsets: List[Dict[str, Union[str, int]]], word_delimiter_char: str = " "
+    ) -> List[Dict[str, Union[str, int]]]:
+        word_offsets = []
+
+        last_state = "SPACE"
+        word = ""
+        start_offset = 0
+        end_offset = 0
+        for offset in offsets:
+            char = offset["char"]
+            state = "SPACE" if char == word_delimiter_char else "WORD"
+
+            if state == last_state:
+                # If we are in the same state as before, we simply repeat what we've done before
+                end_offset = offset["end_offset"]
+                word += char
+            else:
+                # Switching state
+                if state == "SPACE":
+                    # Finishing a word
+                    word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
+                else:
+                    # Starting a new word
+                    start_offset = offset["start_offset"]
+                    end_offset = offset["end_offset"]
+                    word = char
+
+            last_state = state
+        if last_state == "WORD":
+            word_offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
+
+        return word_offsets
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        if is_split_into_words:
+            text = " " + text
+        return (text, kwargs)
+
+    def _decode(
+        self,
+        token_ids: List[int],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        group_tokens: bool = True,
+        spaces_between_special_tokens: bool = False,
+        output_word_offsets: Optional[bool] = False,
+        output_char_offsets: Optional[bool] = False,
+    ) -> str:
+        """
+        A special `_decode` function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly
+        the same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be
+        called on the whole token list and not individually on added tokens.
+        """
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+        result = []
+        for token in filtered_tokens:
+            if skip_special_tokens and (
+                token in self.all_special_ids or (token != self.pad_token and token in self.all_special_tokens)
+            ):
+                continue
+            result.append(token)
+
+        string_output = self.convert_tokens_to_string(
+            result,
+            group_tokens=group_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            output_word_offsets=output_word_offsets,
+            output_char_offsets=output_char_offsets,
+        )
+
+        text = string_output["text"]
+
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            text = self.clean_up_tokenization(text)
+
+        if output_word_offsets or output_char_offsets:
+            return Wav2Vec2CTCTokenizerOutput(
+                text=text,
+                char_offsets=string_output["char_offsets"],
+                word_offsets=string_output["word_offsets"],
+            )
+        else:
+            return text
+
+    # overwritten from `tokenization_utils_base.py` because tokenizer can output
+    # `ModelOutput` which should not be a list for batched output and
+    # because we need docs for `output_char_offsets` here
+    def batch_decode(
+        self,
+        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        output_char_offsets: bool = False,
+        output_word_offsets: bool = False,
+        **kwargs,
+    ) -> List[str]:
+        """
+        Convert a list of lists of token ids into a list of strings by calling decode.
+
+        Args:
+            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids.
Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. + output_char_offsets (`bool`, *optional*, defaults to `False`): + Whether or not to output character offsets. Character offsets can be used in combination with the + sampling rate and model downsampling rate to compute the time-stamps of transcribed characters. + + + + Please take a look at the Example of [`~Wav2Vec2CTCTokenizer.decode`] to better understand how to make + use of `output_char_offsets`. [`~Wav2Vec2CTCTokenizer.batch_decode`] works the same way with batched + output. + + + + output_word_offsets (`bool`, *optional*, defaults to `False`): + Whether or not to output word offsets. Word offsets can be used in combination with the sampling rate + and model downsampling rate to compute the time-stamps of transcribed words. + + + + Please take a look at the Example of [`~Wav2Vec2CTCTokenizer.decode`] to better understand how to make + use of `output_word_offsets`. [`~Wav2Vec2CTCTokenizer.batch_decode`] works the same way with batched + output. + + + + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `List[str]` or [`~models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizerOutput`]: The list of decoded + sentences. Will be a [`~models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizerOutput`] when + `output_char_offsets == True` or `output_word_offsets == True`. + """ + batch_decoded = [ + self.decode( + seq, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + output_char_offsets=output_char_offsets, + output_word_offsets=output_word_offsets, + **kwargs, + ) + for seq in sequences + ] + if output_char_offsets or output_word_offsets: + # transform list of dicts to dict of lists + return Wav2Vec2CTCTokenizerOutput({k: [d[k] for d in batch_decoded] for k in batch_decoded[0]}) + + return batch_decoded + + # overwritten from `tokenization_utils_base.py` because we need docs for `output_char_offsets` + # and `output_word_offsets` here + def decode( + self, + token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + output_char_offsets: bool = False, + output_word_offsets: bool = False, + **kwargs, + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. + output_char_offsets (`bool`, *optional*, defaults to `False`): + Whether or not to output character offsets. Character offsets can be used in combination with the + sampling rate and model downsampling rate to compute the time-stamps of transcribed characters. 
+ + + + Please take a look at the example below to better understand how to make use of `output_char_offsets`. + + + + output_word_offsets (`bool`, *optional*, defaults to `False`): + Whether or not to output word offsets. Word offsets can be used in combination with the sampling rate + and model downsampling rate to compute the time-stamps of transcribed words. + + + + Please take a look at the example below to better understand how to make use of `output_word_offsets`. + + + + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `str` or [`~models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizerOutput`]: The list of decoded + sentences. Will be a [`~models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizerOutput`] when + `output_char_offsets == True` or `output_word_offsets == True`. + + Example: + + ```python + >>> # Let's see how to retrieve time steps for a model + >>> from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC + >>> from datasets import load_dataset + >>> import datasets + >>> import torch + + >>> # import model, feature extractor, tokenizer + >>> model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/wav2vec2-base-960h") + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + + >>> # load first sample of English common_voice + >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True, trust_remote_code=True) + >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) + >>> dataset_iter = iter(dataset) + >>> sample = next(dataset_iter) + + >>> # forward sample through model to get greedily predicted transcription ids + >>> input_values = feature_extractor(sample["audio"]["array"], return_tensors="pt").input_values + >>> logits = model(input_values).logits[0] + >>> pred_ids = torch.argmax(logits, axis=-1) + + >>> # retrieve word stamps (analogous commands for `output_char_offsets`) + >>> outputs = tokenizer.decode(pred_ids, output_word_offsets=True) + >>> # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate + >>> time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate + + >>> word_offsets = [ + ... { + ... "word": d["word"], + ... "start_time": round(d["start_offset"] * time_offset, 2), + ... "end_time": round(d["end_offset"] * time_offset, 2), + ... } + ... for d in outputs.word_offsets + ... 
]
+        >>> # compare word offsets with audio `en_train_0/common_voice_en_19121553.mp3` online on the dataset viewer:
+        >>> # https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/viewer/en
+        >>> word_offsets[:3]
+        [{'word': 'THE', 'start_time': 0.7, 'end_time': 0.78}, {'word': 'TRICK', 'start_time': 0.88, 'end_time': 1.08}, {'word': 'APPEARS', 'start_time': 1.2, 'end_time': 1.64}]
+        ```"""
+        # Convert inputs to python lists
+        token_ids = to_py_obj(token_ids)
+
+        return self._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            output_char_offsets=output_char_offsets,
+            output_word_offsets=output_word_offsets,
+            **kwargs,
+        )
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        return (vocab_file,)
+
+
+class Wav2Vec2Tokenizer(PreTrainedTokenizer):
+    """
+    Constructs a Wav2Vec2 tokenizer.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer
+    to the superclass for more information regarding such methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sentence token.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        word_delimiter_token (`str`, *optional*, defaults to `"|"`):
+            The token used for defining the end of a word.
+        do_lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the output when decoding.
+        do_normalize (`bool`, *optional*, defaults to `False`):
+            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
+            improve the performance for some models, *e.g.*,
+            [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
+        return_attention_mask (`bool`, *optional*, defaults to `False`):
+            Whether or not [`~Wav2Vec2Tokenizer.__call__`] should return `attention_mask`.
+
+
+
+            Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
+            [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
+            `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
+            should be passed.
+
+            For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
+            [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
+            passed for batched inference.
+
+
+
+        **kwargs
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = {
+        "vocab_file": {
+            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
+        },
+        "tokenizer_config_file": {
+            "facebook/wav2vec2-base-960h": (
+                "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json"
+            ),
+        },
+    }
+    model_input_names = ["input_values", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        word_delimiter_token="|",
+        do_lower_case=False,
+        do_normalize=False,
+        return_attention_mask=False,
+        **kwargs,
+    ):
+        warnings.warn(
+            "The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use"
+            " `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.",
+            FutureWarning,
+        )
+
+        self._word_delimiter_token = word_delimiter_token
+
+        self.do_lower_case = do_lower_case
+        self.return_attention_mask = return_attention_mask
+        self.do_normalize = do_normalize
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            do_lower_case=do_lower_case,
+            do_normalize=do_normalize,
+            return_attention_mask=return_attention_mask,
+            word_delimiter_token=word_delimiter_token,
+            **kwargs,
+        )
+
+    @property
+    def word_delimiter_token(self) -> str:
+        """
+        `str`: Word delimiter token. Logs an error if used while not having been set.
+        """
+        if self._word_delimiter_token is None and self.verbose:
+            logger.error("Using word_delimiter_token, but it is not set yet.")
+            return None
+        return str(self._word_delimiter_token)
+
+    @property
+    def word_delimiter_token_id(self) -> Optional[int]:
+        """
+        `Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has not been
+        set.
+        """
+        if self._word_delimiter_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.word_delimiter_token)
+
+    @word_delimiter_token.setter
+    def word_delimiter_token(self, value):
+        self._word_delimiter_token = value
+
+    @word_delimiter_token_id.setter
+    def word_delimiter_token_id(self, value):
+        self._word_delimiter_token = self.convert_tokens_to_ids(value)
+
+    @add_end_docstrings(WAV2VEC2_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
+        padding: Union[bool, str, PaddingStrategy] = False,
+        max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences.
+
+        Args:
+            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of lists of float values. Must be mono channel audio, not
+                stereo, i.e. a single float per timestep.
+
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+ Default value is picked from the class attribute of the same name. + """ + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + + # make sure input is in list format + if is_batched and not isinstance(raw_speech[0], np.ndarray): + raw_speech = [np.asarray(speech) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech) + + # always return batch + if not is_batched: + raw_speech = [raw_speech] + + # zero-mean and unit-variance normalization + if self.do_normalize: + raw_speech = [(x - np.mean(x)) / np.sqrt(np.var(x) + 1e-5) for x in raw_speech] + + # convert into correct format for padding + encoded_inputs = BatchEncoding({"input_values": raw_speech}) + + padded_inputs = self.pad( + encoded_inputs, + padding=padding, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=self.return_attention_mask, + return_tensors=return_tensors, + verbose=verbose, + ) + + return padded_inputs + + @property + def vocab_size(self) -> int: + return len(self.decoder) + + def get_vocab(self) -> Dict: + return dict(self.encoder, **self.added_tokens_encoder) + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token (str) in an index (integer) using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + result = self.decoder.get(index, self.unk_token) + return result + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ + Converts a connectionist-temporal-classification (CTC) output tokens into a single string. 
+ """ + # group same tokens into non-repeating tokens in CTC style decoding + grouped_tokens = [token_group[0] for token_group in groupby(tokens)] + + # filter self.pad_token which is used as CTC-blank token + filtered_tokens = list(filter(lambda token: token != self.pad_token, grouped_tokens)) + + # replace delimiter token + string = "".join([" " if token == self.word_delimiter_token else token for token in filtered_tokens]).strip() + + if self.do_lower_case: + string = string.lower() + + return string + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + **kwargs, + ) -> str: + """ + special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the + same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on + the whole token list and not individually on added tokens + """ + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + result = [] + for token in filtered_tokens: + if skip_special_tokens and ( + token in self.all_special_ids or (token != self.pad_token and token in self.all_special_tokens) + ): + continue + result.append(token) + + text = self.convert_tokens_to_string(result) + + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + return (vocab_file,) + + +__all__ = ["Wav2Vec2CTCTokenizer", "Wav2Vec2Tokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2_bert/__init__.py b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7520263c51bcd3b63747ee0a55e1baff24a777f9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_wav2vec2_bert import *
+    from .modeling_wav2vec2_bert import *
+    from .processing_wav2vec2_bert import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..db52cc5baed3675957c48abe18d50dd59faf4222
--- /dev/null
+++ b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
@@ -0,0 +1,313 @@
+# coding=utf-8
+# Copyright 2024 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wav2Vec2Bert model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Wav2Vec2BertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Wav2Vec2BertModel`]. It is used to
+    instantiate a Wav2Vec2Bert model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Bert
+    [facebook/wav2vec2-bert-rel-pos-large](https://huggingface.co/facebook/wav2vec2-bert-rel-pos-large)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*):
+            Vocabulary size of the Wav2Vec2Bert model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`Wav2Vec2BertModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        feature_projection_input_dim (`int`, *optional*, defaults to 160):
+            Input dimension of this model, i.e. the dimension after processing input audio with
+            [`SeamlessM4TFeatureExtractor`] or [`Wav2Vec2BertProcessor`].
+        hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the feature projection.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`Wav2Vec2BertForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+            Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
+            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+            procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
+            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+            actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the time axis.
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+            irrespective of `mask_time_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
+            mask_time_min_masks`.
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
+            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+            masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks
+            over the axis. If reasoning from the probability of each feature vector to be chosen as the start of the
+            vector span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment
+            is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the feature axis.
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespective of `mask_feature_prob`. Only relevant if
+            `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`Wav2Vec2BertForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`Wav2Vec2BertForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+            instance of [`Wav2Vec2BertForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the projection before token mean-pooling for classification.
+        tdnn_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`Tuple[int]` or `List[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in the *TDNN* module of
+            the *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
+        pad_token_id (`int`, *optional*, defaults to 0): The id of the _padding_ token.
+        bos_token_id (`int`, *optional*, defaults to 1): The id of the _beginning-of-stream_ token.
+        eos_token_id (`int`, *optional*, defaults to 2): The id of the _end-of-stream_ token.
+        add_adapter (`bool`, *optional*, defaults to `False`):
+            Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very
+            useful for warm-starting Wav2Vec2Bert for SpeechEncoderDecoder models.
+        adapter_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adapter_stride (`int`, *optional*, defaults to 2):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        num_adapter_layers (`int`, *optional*, defaults to 1):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        adapter_act (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the adapter layers. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        use_intermediate_ffn_before_adapter (`bool`, *optional*, defaults to `False`):
+            Whether an intermediate feed-forward block should be stacked on top of the Wav2Vec2Bert Encoder and before
+            the adapter network. Only relevant if `add_adapter is True`.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer.
+            If not defined, this defaults to `hidden_size`. Only relevant if `add_adapter is True`.
+        position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`):
+            Can be set to:
+                - `rotary`, for rotary position embeddings.
+                - `relative`, for relative position embeddings.
+                - `relative_key`, for relative position embeddings as defined by Shaw in [Self-Attention
+                  with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            If left as `None`, no relative position embeddings are applied.
+        rotary_embedding_base (`int`, *optional*, defaults to 10000):
+            If `"rotary"` position embeddings are used, defines the size of the embedding base.
+        max_source_positions (`int`, *optional*, defaults to 5000):
+            If `"relative"` position embeddings are used, defines the maximum source input positions.
+        left_max_position_embeddings (`int`, *optional*, defaults to 64):
+            If `"relative_key"` (aka Shaw) position embeddings are used, defines the left clipping value for relative positions.
+        right_max_position_embeddings (`int`, *optional*, defaults to 8):
+            If `"relative_key"` (aka Shaw) position embeddings are used, defines the right clipping value for relative positions.
+        conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
+            Kernel size of convolutional depthwise 1D layer in Conformer blocks.
+        conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all convolutional layers in Conformer blocks.
+
+    Example:
+
+    ```python
+    >>> from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel
+
+    >>> # Initializing a Wav2Vec2Bert facebook/wav2vec2-bert-rel-pos-large style configuration
+    >>> configuration = Wav2Vec2BertConfig()
+
+    >>> # Initializing a model (with random weights) from the facebook/wav2vec2-bert-rel-pos-large style configuration
+    >>> model = Wav2Vec2BertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "wav2vec2-bert"
+
+    def __init__(
+        self,
+        vocab_size=None,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        intermediate_size=4096,
+        feature_projection_input_dim=160,
+        hidden_act="swish",
+        hidden_dropout=0.0,
+        activation_dropout=0.0,
+        attention_dropout=0.0,
+        feat_proj_dropout=0.0,
+        final_dropout=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        apply_spec_augment=True,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        mask_feature_min_masks=0,
+        ctc_loss_reduction="sum",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=768,
+        tdnn_dim=(512, 512, 512, 512, 1500),
+        tdnn_kernel=(5, 3, 3, 1, 1),
+        tdnn_dilation=(1, 2, 3, 1, 1),
+        xvector_output_dim=512,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        add_adapter=False,
+        adapter_kernel_size=3,
+        adapter_stride=2,
+        num_adapter_layers=1,
+        adapter_act="relu",
+        use_intermediate_ffn_before_adapter=False,
+        output_hidden_size=None,
+        position_embeddings_type="relative_key",
+        rotary_embedding_base=10000,
+        max_source_positions=5000,
+        left_max_position_embeddings=64,
+        right_max_position_embeddings=8,
+        conv_depthwise_kernel_size=31,
+        conformer_conv_dropout=0.1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
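+        # Illustrative note, not part of the original code: with the SpecAugment
+        # defaults documented above (mask_time_prob=0.05, mask_time_length=10), a
+        # 400-frame utterance draws about 0.05 * 400 / 10 = 2 independent time
+        # masks, and mask_time_min_masks=2 guarantees at least that many whenever
+        # masking is applied during training.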
self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.feature_projection_input_dim = feature_projection_input_dim + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.use_weighted_layer_sum = use_weighted_layer_sum + self.max_source_positions = max_source_positions + + if position_embeddings_type is not None and position_embeddings_type not in [ + "rotary", + "relative", + "relative_key", + ]: + raise ValueError( + """ + `position_embeddings_type` is not valid. It must be one of the following values: + `["rotary", "relative", "relative_key"]` or left as `None`. + """ + ) + self.position_embeddings_type = position_embeddings_type + self.rotary_embedding_base = rotary_embedding_base + self.left_max_position_embeddings = left_max_position_embeddings + self.right_max_position_embeddings = right_max_position_embeddings + + # Conformer-block related + self.conv_depthwise_kernel_size = conv_depthwise_kernel_size + self.conformer_conv_dropout = conformer_conv_dropout + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.adapter_act = adapter_act + self.output_hidden_size = output_hidden_size if output_hidden_size is not None else hidden_size + if use_intermediate_ffn_before_adapter and not add_adapter: + raise ValueError("`use_intermediate_ffn_before_adapter` is `True` but `add_adapter` is `False`.") + self.use_intermediate_ffn_before_adapter = use_intermediate_ffn_before_adapter + + # SequenceClassification-specific parameter. Feel free to ignore for other classes. + self.classifier_proj_size = classifier_proj_size + + # XVector-specific parameters. Feel free to ignore for other classes. + self.tdnn_dim = list(tdnn_dim) + self.tdnn_kernel = list(tdnn_kernel) + self.tdnn_dilation = list(tdnn_dilation) + self.xvector_output_dim = xvector_output_dim + + @property + def inputs_to_logits_ratio(self): + ratio = self.feature_projection_input_dim * 2 + if self.add_adapter: + ratio = ratio * (self.adapter_stride**self.num_adapter_layers) + return ratio + + +__all__ = ["Wav2Vec2BertConfig"] diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..33510654dcca5ec24940f744653df1e909db3fae --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py @@ -0,0 +1,217 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. 
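+# Illustrative note with assumed values, not from the original file: the
+# `inputs_to_logits_ratio` property defined above gives the number of raw input
+# samples per logit frame. Without an adapter,
+#     Wav2Vec2BertConfig().inputs_to_logits_ratio == 160 * 2 == 320
+# so at an assumed 16 kHz sampling rate each logit frame spans
+# 320 / 16000 = 0.02 s, which is the `time_offset` used when converting decoded
+# word offsets to time stamps.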
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Wav2Vec2Bert BERT checkpoint.""" + +import argparse + +import torch +import torchaudio +from fairseq2.data import Collater +from fairseq2.data.audio import WaveformToFbankConverter +from fairseq2.nn.padding import get_seqs_and_padding_mask +from seamless_communication.models.conformer_shaw import load_conformer_shaw_model + +from transformers import ( + SeamlessM4TFeatureExtractor, + Wav2Vec2BertConfig, + Wav2Vec2BertModel, + logging, +) + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +wav2vec_convert_list = [ + ("encoder_frontend.model_dim_proj", "feature_projection.projection"), + ("encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"), + ("encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"), + ("encoder.inner.layers", "encoder.layers"), + ("encoder.inner_layer_norm", "encoder.layer_norm"), + ("encoder.adaptor_layers", "adapter.layers"), + ("inner_proj", "intermediate_dense"), + ("self_attn.output_proj", "self_attn.linear_out"), + ("output_proj", "output_dense"), + ("self_attn.k_proj", "self_attn.linear_k"), + ("self_attn.v_proj", "self_attn.linear_v"), + ("self_attn.q_proj", "self_attn.linear_q"), + ("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"), + ("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"), + ("self_attn.sdpa.rel_k_embed", "self_attn.distance_embedding"), + ("self_attn.sdpa.r_proj", "self_attn.linear_pos"), + ("conv.pointwise_conv1", "conv_module.pointwise_conv1"), + ("conv.pointwise_conv2", "conv_module.pointwise_conv2"), + ("conv.depthwise_conv", "conv_module.depthwise_conv"), + ("conv.layer_norm", "conv_module.depthwise_layer_norm"), + ("conv_layer_norm", "conv_module.layer_norm"), + ("encoder.proj1", "intermediate_ffn.intermediate_dense"), + ("encoder.proj2", "intermediate_ffn.output_dense"), + ("encoder.layer_norm", "inner_layer_norm"), + ("masker.temporal_mask_embed", "masked_spec_embed"), +] + +keys_to_remove = { + "quantizer.entry_proj", + "final_proj", + "final_target_proj", + "quantizer.entries", + "quantizer.num_updates", +} + + +def param_count(model): + return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0]) + + +def _convert_model( + original_model, + hf_model, + convert_list, +): + state_dict = original_model.state_dict() + + for k, v in list(state_dict.items()): + new_key = k + for old_layer_name, new_layer_name in convert_list: + if old_layer_name in new_key: + new_key = new_key.replace(old_layer_name, new_layer_name) + + # must do it by hand + if ".layer_norm" in new_key and new_key.split(".layer_norm")[0][-1].isnumeric(): + new_key = new_key.replace("layer_norm", "final_layer_norm") + + add_key = True + for key in keys_to_remove: + if key in new_key: + state_dict.pop(k) + add_key = False + break + + if add_key: + state_dict[new_key] = state_dict.pop(k) + + extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys()) + extra_keys = set({k for k in extra_keys if "num_updates" not in k}) # filter 
unecessary param + missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys()) + if len(extra_keys) != 0: + raise ValueError(f"extra keys found: {extra_keys}") + if len(missing_keys) != 0: + raise ValueError(f"missing keys: {missing_keys}") + hf_model.load_state_dict(state_dict, strict=True) + n_params = param_count(hf_model) + + logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params") + + hf_model.eval() + del state_dict + + return hf_model + + +@torch.no_grad() +def convert_wav2vec2_bert_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + config_path=None, + repo_id=None, +): + """ + Copy/paste/tweak model's weights to transformers design. + """ + if config_path is not None: + config = Wav2Vec2BertConfig.from_pretrained(config_path, hidden_act="swish") + else: + config = Wav2Vec2BertConfig(apply_spec_augment=False) + + hf_wav2vec = Wav2Vec2BertModel(config) + + model = load_conformer_shaw_model(checkpoint_path, dtype=torch.float32) + model.eval() + + hf_wav2vec = _convert_model(model, hf_wav2vec, wav2vec_convert_list) + + hf_wav2vec.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + hf_wav2vec.push_to_hub(repo_id, create_pr=True) + + # save feature extractor + fe = SeamlessM4TFeatureExtractor(padding_value=1) + fe._set_processor_class("Wav2Vec2BertProcessor") + fe.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + fe.push_to_hub(repo_id, create_pr=True) + + if args.audio_path: + waveform, sample_rate = torchaudio.load(args.audio_path) + waveform = torchaudio.functional.resample(waveform, sample_rate, fe.sampling_rate) + + fbank_converter = WaveformToFbankConverter( + num_mel_bins=80, + waveform_scale=2**15, + channel_last=True, + standardize=True, + dtype=torch.float32, + ) + collater = Collater(pad_value=1) + + decoded_audio = {"waveform": waveform.T, "sample_rate": fe.sampling_rate, "format": -1} + src = collater(fbank_converter(decoded_audio))["fbank"] + seqs, padding_mask = get_seqs_and_padding_mask(src) + + with torch.inference_mode(): + seqs, padding_mask = model.encoder_frontend(seqs, padding_mask) + original_output, padding_mask = model.encoder(seqs, padding_mask) + + hf_wav2vec.eval() + + inputs = fe(waveform, return_tensors="pt", padding=True) + with torch.no_grad(): + outputs = hf_wav2vec(**inputs) + + torch.testing.assert_close(original_output, outputs.last_hidden_state, rtol=5e-3, atol=5e-3) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + type=str, + help="Path to the output PyTorch model.", + ) + parser.add_argument( + "--checkpoint_path", default="conformer_shaw", type=str, help="Path to seamless communication checkpoint" + ) + parser.add_argument( + "--config_path", + default=None, + type=str, + help="Path to hf config.json of model to convert", + ) + parser.add_argument("--repo_id", default=None, type=str, help="Push to this repo id if precised.") + parser.add_argument( + "--audio_path", + default=None, + type=str, + help="If specified, check that the original model and the converted model produce the same outputs.", + ) + + args = parser.parse_args() + convert_wav2vec2_bert_checkpoint( + args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.repo_id + ) diff --git a/docs/transformers/build/lib/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py new file mode 100644 index 
0000000000000000000000000000000000000000..d3d393f9baa146a5fd6e17a0b30d09c65d742d0d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -0,0 +1,1625 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_wav2vec2_bert.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import math +import warnings +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...integrations.deepspeed import is_deepspeed_zero3_enabled +from ...integrations.fsdp import is_fsdp_managed_module +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + CausalLMOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + Wav2Vec2BaseModelOutput, + XVectorOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_peft_available, +) +from .configuration_wav2vec2_bert import Wav2Vec2BertConfig + + +# General docstring +_CONFIG_FOR_DOC = "Wav2Vec2BertConfig" + + +class Wav2Vec2BertRotaryPositionalEmbedding(nn.Module): + """Rotary positional embedding + Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf + """ + + def __init__(self, config): + super().__init__() + dim = config.hidden_size // config.num_attention_heads + base = config.rotary_embedding_base + + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) + # Ignore copy + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.cached_sequence_length = None + self.cached_rotary_positional_embedding = None + + def forward(self, hidden_states): + sequence_length = hidden_states.shape[1] + + if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None: + return self.cached_rotary_positional_embedding + + self.cached_sequence_length = sequence_length + # Embeddings are computed in the dtype of the inv_freq constant + time_stamps = torch.arange(sequence_length).type_as(self.inv_freq) + freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq) + embeddings = torch.cat((freqs, freqs), dim=-1) + + cos_embeddings = embeddings.cos()[:, None, None, :] + sin_embeddings = embeddings.sin()[:, None, None, :] + # Computed embeddings are cast to the dtype of the hidden state inputs + self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states) + return self.cached_rotary_positional_embedding + + +class Wav2Vec2BertRelPositionalEmbedding(nn.Module): + """Relative positional encoding module.""" + + def __init__(self, config): + super().__init__() + self.max_len = config.max_source_positions + self.d_model = config.hidden_size + self.pe = None + self.extend_pe(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pe(self, x): + # Reset the positional encodings + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + 
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` is the position of query vector and `j` is the
+        # position of key vector. We use positive relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
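+        # Illustrative note, not part of the original file: for a query at
+        # position i and a key at position j, the relative distance i - j ranges
+        # over [-(L - 1), L - 1] for a length-L sequence, which is why self.pe
+        # caches 2 * L - 1 encodings, e.g. 9 position vectors for L = 5.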